Skip to content

Commit

Permalink
python & srt diff
Browse files Browse the repository at this point in the history
  • Loading branch information
ruanjiayou committed Dec 11, 2024
1 parent 1959f64 commit 509c785
Show file tree
Hide file tree
Showing 18 changed files with 749 additions and 146 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
/**/node_modules/
/**/node_modules/
test/
8 changes: 5 additions & 3 deletions ffmpeg/trans.sh
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
# ffmpeg的docker形式
## alias
- alias
> 添加alias后可以快速使用,抽取音频: ff -i input.mp4 -vn output.mp3
- linux
> `alias ff="docker run -v $PWD:/data -w /data jrottenberg/ffmpeg:4.1-alpine -hide_banner $@"`
- powershell
> `function ff() { docker run -v /c/Users/Administrator/Downloads:/data -w /data --name temp jrottenberg/ffmpeg:4.1-alpine -hide_banner $args; docker rm temp }`
- 查看编码器
> -codecs -hide_banner |grep libmp3lame
# mp4转m3u8
-i /data/videos/-.mp4 -codec copy -vbsf h264_mp4toannexb -map 0 -f segment -segment_list /data/videos/m3u8/test.m3u8 -segment_time 5 /data/videos/m3u8/p_%03d.ts
# m3u8转mp4 应该先 protocol_whitelist 再 allowed_extensions 接着 -i 不然可能出错
-i /data/videos/m3u8/test.m3u8 -allowed_extensions ALL -movflags faststart -protocol_whitelist file,tls,tcp,https,crypto -c copy /data/videos/mp4/test.mp4
# 元信息放文件开头 -movflags faststart
# 剪切视频
-i example.mp4 -ss 5m -t 10m output.mp4
# 剪切视频 从 17s 开始,剪切 25s 长度
-i example.mp4 -ss 00:17 -t 25 output.mp4
# 截图
-ss 5.1 -i /data/videos/-.mp4 -s 320x240 -frames:v 1 -f image2 /data/videos/screenshots/test.png
Expand Down
54 changes: 54 additions & 0 deletions nas/proxy.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
const fs = require("fs");
const path = require('path');
const os = require('os');
const net = require('net');
const shell = require('shelljs').exec;
/**
 * Find the first non-internal IPv4 address reported by the operating system.
 *
 * Interfaces are scanned in the order returned by `os.networkInterfaces()`,
 * and the first matching address wins.
 *
 * @returns {Promise<string>} Resolves with the IPv4 address.
 *   Rejects with an `Error` when no non-internal IPv4 address exists.
 */
async function getLocalIP() {
  const interfaces = os.networkInterfaces();
  for (const entries of Object.values(interfaces)) {
    // `entry` instead of `interface`: the latter is a reserved word in strict
    // mode / ES modules and makes the whole file fail to parse there.
    for (const entry of entries ?? []) {
      // Some Node 18 releases report `family` as the number 4 rather than 'IPv4'.
      const isIPv4 = entry.family === 'IPv4' || entry.family === 4;
      if (isIPv4 && !entry.internal) {
        return entry.address;
      }
    }
  }
  throw new Error('No public IP found.');
}

// Point Lantern's settings and the git proxy at this machine's current
// IPv4 address, then launch Lantern.
// NOTE(review): an earlier draft quit Lantern via `osascript` before editing
// the settings file; confirm whether that step is still needed.
getLocalIP()
  .then((ip) => {
    console.log(ip);

    // Read a text file, transform each line, and write the result back in place.
    const rewriteLines = (filepath, transform) => {
      const original = fs.readFileSync(filepath).toString().split('\n');
      const updated = original.map((line) => transform(line));
      fs.writeFileSync(filepath, updated.join('\n'));
    };

    // Lantern settings: replace the HTTP and SOCKS listen addresses.
    rewriteLines('/Users/jiayou/Library/Application Support/Lantern/settings.yaml', (line) => {
      if (line.startsWith('addr:')) {
        return `addr: ${ip}:8899`;
      }
      if (line.startsWith('socksAddr:')) {
        return `socksAddr: ${ip}:9988`;
      }
      return line;
    });

    // Git config: replace only the `proxy = ...` key. Matching the `=` keeps
    // sibling keys such as `proxyAuthMethod` untouched, and the captured
    // leading whitespace preserves the line's existing indentation.
    rewriteLines('/Users/jiayou/.gitconfig', (line) => {
      const match = /^(\s*)proxy\s*=/.exec(line);
      return match ? `${match[1]}proxy = ${ip}:8899` : line;
    });

    const r2 = shell('open /Applications/Lantern.app');
    console.log(r2.code, r2.stderr);
    console.log('finished');
  })
  .catch((err) => {
    // Covers both "no address found" and file read/write failures, so the
    // script ends with a message and a non-zero exit code instead of an
    // unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
  });

1 change: 1 addition & 0 deletions nodejs/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.tmp/
130 changes: 46 additions & 84 deletions nodejs/diff-utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@ const path = require('path');
const { simplecc } = require('simplecc-wasm');
const { diffWordsWithSpace } = require('diff');

del('subtitles/step2-diff.txt');

// 标记符号
const SYMBOL = {
SPLIT: '{|}',
CONCAT: '{&}',
MODIFY: '}-+{',
DEL_ST: 'x{',
DEL_ED: '}x',
const F = {
SPLIT: '|',
CONCAT: '&',
MODIFY: '-+',
DEL_ST: '{',
DEL_ED: '}',
ADD: '+',
REM: '-',
}
// 相对路径转绝对路径
function rel(file) {
Expand All @@ -26,14 +26,14 @@ function del(fullpath) {

// 删除符号
function clear(str) {
return str.replace(/[.,/#!$%^&*;:{}=_`~()'"\[\]?<>\\|@+-]/g, '').replace(/(“)|(”)|(《)|(》)|(\s*)/g, '');
return str.replace(/[\p{P}\p{S}\s]/gu, '');
}

function pure_diff(txt) {
return txt.replace(/-\{(.+?)\}-/g, '').replace(/(\+\{)|(\}\+)/g, '')
return txt.replace(/-(.+?)-/g, '').replace(/\+/g, '')
}
function apply_diff(txt) {
return txt.replace(/\+\{(.+?)\}\+/g, '').replace(/(-\{|()\}-)/g, '')
return txt.replace(/\+(.+?)\+/g, '').replace(/-/g, '')
}

function read(filepath) {
Expand All @@ -47,24 +47,24 @@ function write(filepath, txt) {
function t2s(txt) {
return simplecc(txt, 't2s');
}
// 字幕时间字符串转毫秒数
// 字幕时间字符串转秒数
function t2n(srtTime) {
const [hours, minutes, seconds] = srtTime.split(":");
const [secs, millis] = seconds.split(",");

const h = parseInt(hours, 10) * 3600000; // 小时 -> 毫秒
const m = parseInt(minutes, 10) * 60000; // 分钟 -> 毫秒
const s = parseInt(secs, 10) * 1000; // 秒 -> 毫秒
const ms = parseInt(millis, 10); // 毫秒
const h = parseInt(hours, 10) * 3600; // 小时 -> 毫秒
const m = parseInt(minutes, 10) * 60; // 分钟 -> 毫秒
const s = parseInt(secs, 10); // 秒 -> 毫秒
const ms = parseInt(millis, 10) / 1000; // 毫秒

return h + m + s + ms;
}

function n2t(milliseconds) {
const hours = Math.floor(milliseconds / 3600000);
const minutes = Math.floor((milliseconds % 3600000) / 60000);
const seconds = Math.floor((milliseconds % 60000) / 1000);
const millis = milliseconds % 1000;
function n2t(seconds) {
const millis = Math.round((seconds % 1) * 1000);
const hours = Math.floor(seconds / 3600);
const minutes = Math.floor((seconds % 3600) / 60);
seconds = Math.floor((seconds % 60));

return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")},${String(millis).padStart(3, "0").substring(0, 3)}`;
}
Expand Down Expand Up @@ -98,46 +98,20 @@ module.exports = function txt2srt(document, srt_list) {
// 首尾换行交换
const start_br = part.value.startsWith('\n');
const end_br = part.value.endsWith('\n');
const txt = `${start_br ? '\n' : ''}x{${part.value.trim()}}x${end_br ? '\n' : ''}`;
const txt = `${start_br ? '\n' : ''}` + F.DEL_ST + part.value.trim() + F.DEL_ED + `${end_br ? '\n' : ''}`;
diffed_txt += txt;
// write('subtitles/step2-diff.txt', txt);
} else if (part.value === '\n') {
// 多了个换行就是被分割了需要合并
diffed_txt += '{&}\n'
// write('subtitles/step2-diff.txt', `{&}\n`);
} else if (part.value.includes('\n')) {
const is_start = part.value.startsWith('\n')
const txt = `${is_start && !diffs[nth - 1].value.endsWith('}-') ? '\n' : ''}+{${part.value.trim()}}+${is_start ? '' : '{&}\n'}`
const txt = part.value.split('\n').map(v => v !== '' ? F.ADD + v + F.ADD : '').join(F.CONCAT + '\n')
diffed_txt += txt;
// write('subtitles/step2-diff.txt', txt);
} else {
const txt = `+{${part.value}}+`;
const txt = `${F.ADD}${part.value}${F.ADD}`;
diffed_txt += txt;
// write('subtitles/step2-diff.txt', txt)
}
} else if (part.removed) {
// 有换行就需要分割时间区间,一般应该只有一个换行在开头
let real_value = part.value;
let txt = '';
if (part.value.startsWith('\n')) {
txt += '{|}'
real_value = part.value.replace(/^\n/, '')
if (real_value && !real_value.endsWith('\n')) {
txt += `-{${real_value}}-`;
real_value = '';
}
}
if (real_value.endsWith('\n')) {
real_value = real_value.replace(/\n$/, '')
if (real_value) {
txt += `-{${real_value}}-`
real_value = '';
}
txt += '{|}'
}
if (real_value) {
txt += `-{${real_value}}-`
}
const txt = part.value.split('\n').map(v => v !== '' ? F.REM + v + F.REM : '').join(F.SPLIT);
diffed_txt += txt;
// write('subtitles/step2-diff.txt', txt)
} else {
Expand All @@ -147,34 +121,35 @@ module.exports = function txt2srt(document, srt_list) {
}
});

const full_info_arr = diffed_txt
const diff_arr = diffed_txt
.trim()
.split('\n')
.map((line, n) => ({
diff: line,
raw: srt_obj_arr[n].sentence,
start: srt_obj_arr[n].start,
end: srt_obj_arr[n].end,
}));
.split('\n');

const full_info_arr = diff_arr.map((line, n) => ({
diff: line,
raw: srt_obj_arr[n].sentence,
start: srt_obj_arr[n].start,
end: srt_obj_arr[n].end,
}));
console.log('此时全量srt和符号srt的长度必定相同', full_info_arr.length, srt_obj_arr.length)

// 生成最终字幕文件
const dealed_arr = [];
let deleted = '', deleted_str = '';
let info = full_info_arr.shift();
while (info) {
if (info.diff.includes(SYMBOL.DEL_ST)) {
if (info.diff.includes(F.DEL_ST)) {
deleted = true;
deleted_str = info.diff.substring(0, info.diff.indexOf(SYMBOL.DEL_ST));
deleted_str = info.diff.substring(0, info.diff.indexOf(F.DEL_ST));
}
if (deleted || info.diff.replace(/\{\&\}$/, '').match(/^\+\{(.+)\}\+$/)) {
if (deleted || info.diff.match(/^\+([^-]+?)\+(\&)?$/)) {
// 多行和单行删除 跳过
} else {
if (info.diff.includes(SYMBOL.SPLIT)) {
if (info.diff.includes(F.SPLIT)) {
// 删除分割符拆分,分配时长,等待重新循环
// 计算拆分的时长 acc_ts
const segments = info.diff
.split(SYMBOL.SPLIT)
.split(F.SPLIT)
.map(diff_seg => {
// 还原为文稿简体
const original_txt = pure_diff(diff_seg);
Expand All @@ -196,13 +171,14 @@ module.exports = function txt2srt(document, srt_list) {
start: st,
end: st + seg.acc_ts,
});
st += seg.acc_ts;
});
full_info_arr.unshift(...additions);
} else if (info.diff.includes(SYMBOL.CONCAT)) {
} else if (info.diff.includes(F.CONCAT)) {
// 连接下一行,等待重新循环
const next = full_info_arr.shift();
const new_one = {
diff: info.diff.replace(SYMBOL.CONCAT, '') + next.diff,
diff: info.diff.replace(F.CONCAT, '') + next.diff,
raw: info.raw + next.raw,
start: info.start,
end: next.end,
Expand All @@ -219,8 +195,8 @@ module.exports = function txt2srt(document, srt_list) {
}
}

if (info.diff.includes(SYMBOL.DEL_ED)) {
info.diff = deleted_str + info.diff.substring(info.diff.indexOf(SYMBOL.DEL_ED) + SYMBOL.DEL_ED.length);
if (info.diff.includes(F.DEL_ED)) {
info.diff = deleted_str + info.diff.substring(info.diff.indexOf(F.DEL_ED) + F.DEL_ED.length);
deleted = false;
deleted_str = '';
// diff的小问题,删除在中间需要连接
Expand All @@ -231,25 +207,11 @@ module.exports = function txt2srt(document, srt_list) {
// 处理下一行 diff
info = full_info_arr.shift();
}

// 处理diff算法的一个小问题,前一行末尾是删除,后一行是添加,本该是一行的
document_dealed_arr.forEach(function (line, n) {
const curr = dealed_arr[n];
if (line === curr.line) {

} else {
const next = dealed_arr[n + 1];
if (line === curr.line + next.line) {
curr.line += next.line;
curr.end = next.end;
dealed_arr.splice(n + 1, 1);
}
}
});
console.log(dealed_arr.length, document_dealed_arr.length)
console.log('处理后的字幕对象和原始文稿一样长', dealed_arr.length, document_str_arr.length);

return dealed_arr.map((item, no) => {
return `${no + 1}\n${n2t(item.start)} --> ${n2t(item.end)}\n${document_str_arr[no]}\n${item.line}\n`;
return `${no + 1}\n${n2t(item.start)} --> ${n2t(item.end)}\n${document_str_arr[no]}\n\n`;
}).join('\n');

}
48 changes: 46 additions & 2 deletions nodejs/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const path = require('path');
const shelljs = require('shelljs');
const bodyParser = require('body-parser');
const crypto = require('crypto');
const multer = require('multer');
// const amqplib = require('amqplib');
const diffsrt = require('./diff-utils.js');

Expand Down Expand Up @@ -42,6 +43,21 @@ const got = require('got').default;
const FormData = require('form-data');
const qs = require('qs');

// Disk storage for uploaded files. Files land in `.tmp` next to this script
// under a name built from the form field, a timestamp, a random number and
// the original file extension.
const storage = multer.diskStorage({
  // A string destination lets multer create the directory when it is missing.
  // With a function destination the caller must create it, and `.tmp/` is
  // git-ignored, so it does not exist on a fresh checkout.
  destination: path.join(__dirname, './.tmp'),
  filename(req, file, cb) {
    const uniqueSuffix = `${Date.now()}-${Math.round(Math.random() * 1E9)}`;
    cb(null, `${file.fieldname}-${uniqueSuffix}${path.extname(file.originalname)}`);
  },
});
const uploader = multer({ storage });
// Accept at most one file for each of the two expected form fields.
const multi = uploader.fields([
  { name: 'libretto', maxCount: 1 },
  { name: 'transcription', maxCount: 1 },
]);

app.use(bodyParser.json({ limit: '3mb' }));
//app.use(bodyParser.urlencoded({ limit: '3mb', extended: false }))
app.use(express.static('./static'));
Expand Down Expand Up @@ -140,10 +156,38 @@ app.get('/test/got-form', async (req, res) => {
res.json(data);
})

app.post('/test/diff-srt', async (req, res) => {
app.post('/diff-srt/json', async (req, res) => {
const srt = diffsrt(req.body.document, req.body.segments);
res.end(srt);
})
});

// ffmpeg -i input.mp4 -map_metadata -1 -c:a aac output.aac
/**
 * POST /diff-srt/file
 *
 * Expects a multipart upload with two files:
 *   - `libretto`: text passed to `diffsrt` as the document
 *   - `transcription`: JSON with a `segments` array of `{ text, start, end }`
 *
 * Responds with the text returned by `diffsrt`, with `MissFile` when either
 * file is absent, or with status 400 and the error message when reading,
 * parsing or diffing fails. Uploaded temp files are removed in every case.
 */
app.post('/diff-srt/file', multi, async (req, res) => {
  // `req.files` is absent when the request carried no multipart files.
  const uploads = req.files || {};
  const transcription = (uploads.transcription || [])[0];
  const libretto = (uploads.libretto || [])[0];

  try {
    if (!transcription || !libretto) {
      // Returning inside `try` still runs `finally`, so a lone uploaded file
      // is cleaned up instead of being left behind in the temp directory.
      return res.end('MissFile');
    }

    const parsed = JSON.parse(fs.readFileSync(transcription.path, { encoding: 'utf-8' }));
    const srt = diffsrt(
      // libretto text
      fs.readFileSync(libretto.path, { encoding: 'utf-8' }),
      // transcription segments, reduced to the fields passed on to diffsrt
      parsed.segments.map(t => ({ text: t.text, start: t.start, end: t.end })),
    );
    res.end(srt);
  } catch (e) {
    console.log(e);
    res.status(400);
    res.end(e.message);
  } finally {
    for (const file of [libretto, transcription]) {
      if (!file) {
        continue;
      }
      try {
        fs.unlinkSync(file.path);
      } catch (cleanupError) {
        // The response has already been sent; log rather than let the
        // failure escape the async handler as an unhandled rejection.
        console.log(cleanupError);
      }
    }
  }
});

app.listen(7003, function () {
console.log('express started at: 7003')
Expand Down
Loading

0 comments on commit 509c785

Please sign in to comment.