python & srt diff

ruanjiayou · Dec 11, 2024 · 509c785 · 509c785
1 parent 1959f64
commit 509c785
Show file tree

Hide file tree

Showing 18 changed files with 749 additions and 146 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-/**/node_modules/
+/**/node_modules/
+test/
diff --git a/ffmpeg/trans.sh b/ffmpeg/trans.sh
@@ -1,20 +1,22 @@
 # ffmpeg的docker形式
-## alias
+- alias
 > 添加alias后可以快速使用,抽取音频: ff -i input.mp4 -vn output.mp3
 - linux
   > `alias ff="docker run -v $PWD:/data -w /data jrottenberg/ffmpeg:4.1-alpine -hide_banner $@"`
 - powershell
   > `function ff() { docker run -v /c/Users/Administrator/Downloads:/data -w /data --name temp jrottenberg/ffmpeg:4.1-alpine -hide_banner $args; docker rm temp }`
 
+- 查看编码器
+> -codecs -hide_banner |grep libmp3lame
 # mp4转m3u8
 -i /data/videos/-.mp4  -codec copy -vbsf h264_mp4toannexb -map 0 -f segment -segment_list /data/videos/m3u8/test.m3u8 -segment_time 5 /data/videos/m3u8/p_%03d.ts
 
 # m3u8转mp4 应该先 protocol_whitelist 再 allowed_extensions 接着 -i 不然可能出错
 -i /data/videos/m3u8/test.m3u8 -allowed_extensions ALL -movflags faststart -protocol_whitelist file,tls,tcp,https,crypto -c copy /data/videos/mp4/test.mp4
 # 元信息放文件开头 -movflags faststart
 
-# 剪切视频
--i example.mp4 -ss 5m -t 10m output.mp4
+# 剪切视频 从 17s 开始,剪切 25s 长度
+-i example.mp4 -ss 00:17 -t 25 output.mp4
 
 # 截图
 -ss 5.1 -i /data/videos/-.mp4  -s 320x240 -frames:v 1 -f image2 /data/videos/screenshots/test.png

diff --git a/nas/proxy.js b/nas/proxy.js
@@ -0,0 +1,54 @@
+const fs = require("fs");
+const path = require('path');
+const os = require('os');
+const net = require('net');
+const shell = require('shelljs').exec;
+/**
+ * Find the first IPv4 address of this machine that is not flagged `internal`.
+ *
+ * @returns {Promise<string>} resolves with the address; rejects with
+ *   Error('No public IP found.') when no interface qualifies.
+ */
+function getLocalIP() {
+  return new Promise((resolve, reject) => {
+    const interfaces = os.networkInterfaces();
+    for (const name of Object.keys(interfaces)) {
+      // `interface` is a reserved word in strict mode / ES modules, so the
+      // loop variable uses a different name.
+      for (const info of interfaces[name] || []) {
+        // Node 18.0-18.3 reported `family` as the number 4 instead of 'IPv4'.
+        const isIPv4 = info.family === 'IPv4' || info.family === 4;
+        if (isIPv4 && !info.internal) {
+          resolve(info.address);
+          return;
+        }
+      }
+    }
+    reject(new Error('No public IP found.'));
+  });
+}
+
+// Point Lantern's settings.yaml and the git proxy entry in .gitconfig at this
+// machine's current IPv4 address, then open the Lantern app.
+getLocalIP().then(ip => {
+  console.log(ip);
+  const appName = 'Lantern';
+  // const r1 = shell(`osascript -e 'quit app "${appName}"'`);
+  // console.log(r1.code, r1.stderr);
+  // const filepath = path.join(__dirname, "settings.yml");
+  const filepath = "/Users/jiayou/Library/Application\ Support/Lantern/settings.yaml"
+  const lines = fs.readFileSync(filepath).toString().split('\n');
+  // Only the top-level `addr:` and `socksAddr:` lines are rewritten; every
+  // other line is written back unchanged.
+  const news = lines.map(line => {
+    if (line.startsWith('addr:')) {
+      return line.split(':')[0] + ': ' + ip + ':8899';
+    } else if (line.startsWith('socksAddr:')) {
+      return line.split(':')[0] + ': ' + ip + ':9988';
+    } else {
+      return line;
+    }
+  });
+  fs.writeFileSync(filepath, news.join('\n'));
+
+  const gitfilepath = '/Users/jiayou/.gitconfig';
+  const lines2 = fs.readFileSync(gitfilepath).toString().split('\n');
+  // Any line whose trimmed text starts with `proxy` is replaced, whichever
+  // section it belongs to.
+  const news2 = lines2.map(line => {
+    if (line.trim().startsWith('proxy')) {
+      return `        proxy = ${ip}:8899`
+    } else {
+      return line;
+    }
+  });
+  fs.writeFileSync(gitfilepath, news2.join('\n'));
+  const r2 = shell('open /Applications/Lantern.app');
+  console.log(r2.code, r2.stderr);
+  console.log('finished');
+}).catch(err => {
+  // getLocalIP() rejects when no address qualifies and the fs calls throw on a
+  // missing file; report the failure and exit non-zero instead of leaving an
+  // unhandled rejection.
+  console.error(err);
+  process.exitCode = 1;
+});
+
diff --git a/nodejs/.gitignore b/nodejs/.gitignore
@@ -0,0 +1 @@
+.tmp/
diff --git a/nodejs/diff-utils.js b/nodejs/diff-utils.js
@@ -3,15 +3,15 @@ const path = require('path');
 const { simplecc } = require('simplecc-wasm');
 const { diffWordsWithSpace } = require('diff');
 
-del('subtitles/step2-diff.txt');
-
 // Marker symbols written into the diffed text
-const SYMBOL = {
-  SPLIT: '{|}',
-  CONCAT: '{&}',
-  MODIFY: '}-+{',
-  DEL_ST: 'x{',
-  DEL_ED: '}x',
+const F = {
+  SPLIT: '|', // joins the pieces of a removed part that contained newlines; txt2srt splits a line on it
+  CONCAT: '&', // placed before '\n' inside an added part; txt2srt merges such a line with the next one
+  MODIFY: '-+', // not referenced in the visible hunks
+  DEL_ST: '{', // opens a deleted passage
+  DEL_ED: '}', // closes a deleted passage
+  ADD: '+', // wraps added text: +text+
+  REM: '-', // wraps removed text: -text-
 }
 // 相对路径转绝对路径
 function rel(file) {
@@ -26,14 +26,14 @@ function del(fullpath) {
 
 // Strip punctuation, symbols and whitespace from str
 function clear(str) {
-  return str.replace(/[.,/#!$%^&*;:{}=_`~()'"\[\]?<>\\|@+-]/g, '').replace(/(“)|(”)|(《)|(》)|(\s*)/g, '');
+  return str.replace(/[\p{P}\p{S}\s]/gu, ''); // \p{P} = Unicode punctuation, \p{S} = Unicode symbols
 }
 
 function pure_diff(txt) {
-  return txt.replace(/-\{(.+?)\}-/g, '').replace(/(\+\{)|(\}\+)/g, '')
+  return txt.replace(/-(.+?)-/g, '').replace(/\+/g, '') // drop each -...- span (never across a newline), then strip every '+' so the wrapped text stays
 }
 function apply_diff(txt) {
-  return txt.replace(/\+\{(.+?)\}\+/g, '').replace(/(-\{|()\}-)/g, '')
+  return txt.replace(/\+(.+?)\+/g, '').replace(/-/g, '') // drop each +...+ span (never across a newline), then strip every '-' so the wrapped text stays
 }
 
 function read(filepath) {
@@ -47,24 +47,24 @@ function write(filepath, txt) {
 function t2s(txt) {
   return simplecc(txt, 't2s'); // NOTE(review): 't2s' presumably selects Traditional -> Simplified Chinese conversion; confirm against simplecc-wasm
 }
-// 字幕时间字符串转毫秒数
+// Convert a subtitle timestamp string (HH:MM:SS,mmm) to a number of seconds
 function t2n(srtTime) {
   const [hours, minutes, seconds] = srtTime.split(":");
   const [secs, millis] = seconds.split(",");
 
-  const h = parseInt(hours, 10) * 3600000; // 小时 -> 毫秒
-  const m = parseInt(minutes, 10) * 60000; // 分钟 -> 毫秒
-  const s = parseInt(secs, 10) * 1000;     // 秒 -> 毫秒
-  const ms = parseInt(millis, 10);         // 毫秒
+  const h = parseInt(hours, 10) * 3600; // hours -> seconds
+  const m = parseInt(minutes, 10) * 60; // minutes -> seconds
+  const s = parseInt(secs, 10);     // whole seconds
+  const ms = parseInt(millis, 10) / 1000;         // milliseconds -> fractional seconds
 
   return h + m + s + ms;
 }
 
-function n2t(milliseconds) {
-  const hours = Math.floor(milliseconds / 3600000);
-  const minutes = Math.floor((milliseconds % 3600000) / 60000);
-  const seconds = Math.floor((milliseconds % 60000) / 1000);
-  const millis = milliseconds % 1000;
+/**
+ * Format a number of seconds as a subtitle timestamp `HH:MM:SS,mmm`.
+ *
+ * @param {number} seconds - time in seconds, may carry a fractional part.
+ * @returns {string} the formatted timestamp.
+ */
+function n2t(seconds) {
+  // Round to whole milliseconds once, up front. Rounding only the fractional
+  // part can produce 1000 ms (e.g. 1.9996 s), which would then be cut to "100"
+  // instead of carrying into the seconds field.
+  const totalMillis = Math.round(seconds * 1000);
+  const hours = Math.floor(totalMillis / 3600000);
+  const minutes = Math.floor((totalMillis % 3600000) / 60000);
+  const secs = Math.floor((totalMillis % 60000) / 1000);
+  const millis = totalMillis % 1000;
 
-  return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(2, "0")}:${String(seconds).padStart(2, "0")},${String(millis).padStart(3, "0").substring(0, 3)}`;
+  return `${String(hours).padStart(2, "0")}:${String(minutes).padStart(2, "0")}:${String(secs).padStart(2, "0")},${String(millis).padStart(3, "0")}`;
 }
@@ -98,46 +98,20 @@ module.exports = function txt2srt(document, srt_list) {
         // 首尾换行交换
         const start_br = part.value.startsWith('\n');
         const end_br = part.value.endsWith('\n');
-        const txt = `${start_br ? '\n' : ''}x{${part.value.trim()}}x${end_br ? '\n' : ''}`;
+        const txt = `${start_br ? '\n' : ''}` + F.DEL_ST + part.value.trim() + F.DEL_ED + `${end_br ? '\n' : ''}`;
         diffed_txt += txt;
         // write('subtitles/step2-diff.txt', txt);
-      } else if (part.value === '\n') {
-        // 多了个换行就是被分割了需要合并
-        diffed_txt += '{&}\n'
-        // write('subtitles/step2-diff.txt', `{&}\n`);
       } else if (part.value.includes('\n')) {
-        const is_start = part.value.startsWith('\n')
-        const txt = `${is_start && !diffs[nth - 1].value.endsWith('}-') ? '\n' : ''}+{${part.value.trim()}}+${is_start ? '' : '{&}\n'}`
+        const txt = part.value.split('\n').map(v => v !== '' ? F.ADD + v + F.ADD : '').join(F.CONCAT + '\n')
         diffed_txt += txt;
         // write('subtitles/step2-diff.txt', txt);
       } else {
-        const txt = `+{${part.value}}+`;
+        const txt = `${F.ADD}${part.value}${F.ADD}`;
         diffed_txt += txt;
         // write('subtitles/step2-diff.txt', txt)
       }
     } else if (part.removed) {
-      // 有换行就需要分割时间区间，一般应该只有一个换行在开头
-      let real_value = part.value;
-      let txt = '';
-      if (part.value.startsWith('\n')) {
-        txt += '{|}'
-        real_value = part.value.replace(/^\n/, '')
-        if (real_value && !real_value.endsWith('\n')) {
-          txt += `-{${real_value}}-`;
-          real_value = '';
-        }
-      }
-      if (real_value.endsWith('\n')) {
-        real_value = real_value.replace(/\n$/, '')
-        if (real_value) {
-          txt += `-{${real_value}}-`
-          real_value = '';
-        }
-        txt += '{|}'
-      }
-      if (real_value) {
-        txt += `-{${real_value}}-`
-      }
+      const txt = part.value.split('\n').map(v => v !== '' ? F.REM + v + F.REM : '').join(F.SPLIT);
       diffed_txt += txt;
       // write('subtitles/step2-diff.txt', txt)
     } else {
@@ -147,34 +121,35 @@ module.exports = function txt2srt(document, srt_list) {
     }
   });
 
-  const full_info_arr = diffed_txt
+  const diff_arr = diffed_txt
     .trim()
-    .split('\n')
-    .map((line, n) => ({
-      diff: line,
-      raw: srt_obj_arr[n].sentence,
-      start: srt_obj_arr[n].start,
-      end: srt_obj_arr[n].end,
-    }));
+    .split('\n');
+
+  const full_info_arr = diff_arr.map((line, n) => ({
+    diff: line,
+    raw: srt_obj_arr[n].sentence,
+    start: srt_obj_arr[n].start,
+    end: srt_obj_arr[n].end,
+  }));
   console.log('此时全量srt和符号srt的长度必定相同', full_info_arr.length, srt_obj_arr.length)
 
   // 生成最终字幕文件
   const dealed_arr = [];
   let deleted = '', deleted_str = '';
   let info = full_info_arr.shift();
   while (info) {
-    if (info.diff.includes(SYMBOL.DEL_ST)) {
+    if (info.diff.includes(F.DEL_ST)) {
       deleted = true;
-      deleted_str = info.diff.substring(0, info.diff.indexOf(SYMBOL.DEL_ST));
+      deleted_str = info.diff.substring(0, info.diff.indexOf(F.DEL_ST));
     }
-    if (deleted || info.diff.replace(/\{\&\}$/, '').match(/^\+\{(.+)\}\+$/)) {
+    if (deleted || info.diff.match(/^\+([^-]+?)\+(\&)?$/)) {
       // 多行和单行删除 跳过
     } else {
-      if (info.diff.includes(SYMBOL.SPLIT)) {
+      if (info.diff.includes(F.SPLIT)) {
         // 删除分割符拆分,分配时长,等待重新循环
         // 计算拆分的时长 acc_ts
         const segments = info.diff
-          .split(SYMBOL.SPLIT)
+          .split(F.SPLIT)
           .map(diff_seg => {
             // 还原为文稿简体
             const original_txt = pure_diff(diff_seg);
@@ -196,13 +171,14 @@ module.exports = function txt2srt(document, srt_list) {
             start: st,
             end: st + seg.acc_ts,
           });
+          st += seg.acc_ts;
         });
         full_info_arr.unshift(...additions);
-      } else if (info.diff.includes(SYMBOL.CONCAT)) {
+      } else if (info.diff.includes(F.CONCAT)) {
         // 连接下一行,等待重新循环
         const next = full_info_arr.shift();
         const new_one = {
-          diff: info.diff.replace(SYMBOL.CONCAT, '') + next.diff,
+          diff: info.diff.replace(F.CONCAT, '') + next.diff,
           raw: info.raw + next.raw,
           start: info.start,
           end: next.end,
@@ -219,8 +195,8 @@ module.exports = function txt2srt(document, srt_list) {
       }
     }
 
-    if (info.diff.includes(SYMBOL.DEL_ED)) {
-      info.diff = deleted_str + info.diff.substring(info.diff.indexOf(SYMBOL.DEL_ED) + SYMBOL.DEL_ED.length);
+    if (info.diff.includes(F.DEL_ED)) {
+      info.diff = deleted_str + info.diff.substring(info.diff.indexOf(F.DEL_ED) + F.DEL_ED.length);
       deleted = false;
       deleted_str = '';
       // diff的小问题,删除在中间需要连接
@@ -231,25 +207,11 @@ module.exports = function txt2srt(document, srt_list) {
     // 处理下一行 diff
     info = full_info_arr.shift();
   }
-
-  // 处理diff算法的一个小问题,前一行末尾是删除,后一行是添加,本该是一行的
-  document_dealed_arr.forEach(function (line, n) {
-    const curr = dealed_arr[n];
-    if (line === curr.line) {
-
-    } else {
-      const next = dealed_arr[n + 1];
-      if (line === curr.line + next.line) {
-        curr.line += next.line;
-        curr.end = next.end;
-        dealed_arr.splice(n + 1, 1);
-      }
-    }
-  });
+  console.log(dealed_arr.length, document_dealed_arr.length)
   console.log('处理后的字幕对象和原始文稿一样长', dealed_arr.length, document_str_arr.length);
 
   return dealed_arr.map((item, no) => {
-    return `${no + 1}\n${n2t(item.start)} --> ${n2t(item.end)}\n${document_str_arr[no]}\n${item.line}\n`;
+    return `${no + 1}\n${n2t(item.start)} --> ${n2t(item.end)}\n${document_str_arr[no]}\n\n`;
   }).join('\n');
 
 }
diff --git a/nodejs/index.js b/nodejs/index.js
@@ -7,6 +7,7 @@ const path = require('path');
 const shelljs = require('shelljs');
 const bodyParser = require('body-parser');
 const crypto = require('crypto');
+const multer = require('multer');
 // const amqplib = require('amqplib');
 const diffsrt = require('./diff-utils.js');
 
@@ -42,6 +43,21 @@ const got = require('got').default;
 const FormData = require('form-data');
 const qs = require('qs');
 
+// Disk storage for uploaded files: they are staged in ./.tmp next to this file
+// under a unique name that keeps the original extension.
+const storage = multer.diskStorage({
+  destination: function (req, file, cb) {
+    const dir = path.join(__dirname, './.tmp');
+    // multer does not create the directory when `destination` is a function,
+    // and .tmp/ is git-ignored, so make sure it exists before handing it over.
+    fs.mkdir(dir, { recursive: true }, (err) => cb(err, dir));
+  },
+  filename: function (req, file, cb) {
+    const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9);
+    cb(null, file.fieldname + '-' + uniqueSuffix + path.extname(file.originalname));
+  },
+});
+const uploader = multer({ storage: storage });
+// Accept at most one file per field: `libretto` and `transcription`.
+const multi = uploader.fields([
+  { name: 'libretto', maxCount: 1 },
+  { name: 'transcription', maxCount: 1 },
+]);
+
 app.use(bodyParser.json({ limit: '3mb' }));
 //app.use(bodyParser.urlencoded({ limit: '3mb', extended: false }))
 app.use(express.static('./static'));
@@ -140,10 +156,38 @@ app.get('/test/got-form', async (req, res) => {
   res.json(data);
 })
 
-app.post('/test/diff-srt', async (req, res) => {
-  const srt = diffsrt(req.body.document, req.body.segments);
-  res.end(srt);
-})
+// Build an SRT from a JSON body carrying `document` and `segments`.
+// A failure inside diffsrt is answered with 400 and the error message, the
+// same way /diff-srt/file reports it, instead of leaving the request hanging.
+app.post('/diff-srt/json', async (req, res) => {
+  try {
+    const srt = diffsrt(req.body.document, req.body.segments);
+    res.end(srt);
+  } catch (e) {
+    console.log(e);
+    res.status(400);
+    res.end(e.message);
+  }
+});
+
+// Upload variant of /diff-srt/json: expects multipart fields `libretto` (text
+// file) and `transcription` (JSON file with a `segments` array).
+// ffmpeg -i input.mp4 -map_metadata -1 -c:a aac output.aac
+app.post('/diff-srt/file', multi, async (req, res) => {
+  // req.files is not populated when the request is not multipart/form-data.
+  const files = req.files || {};
+  const transcription = (files.transcription || []).find(a => a.fieldname === 'transcription');
+  const libretto = (files.libretto || []).find(a => a.fieldname === 'libretto');
+
+  try {
+    if (!transcription || !libretto) {
+      // Returning inside the try lets the finally block remove the one file
+      // that may have been uploaded.
+      return res.end('MissFile');
+    }
+    const o = JSON.parse(fs.readFileSync(transcription.path, { encoding: 'utf-8' }).toString());
+    const srt = diffsrt(
+      // libretto text
+      fs.readFileSync(libretto.path, { encoding: 'utf-8' }).toString(),
+      // transcription segments, reduced to the fields passed on to diffsrt
+      o.segments.map(t => ({ text: t.text, start: t.start, end: t.end })),
+    )
+    res.end(srt);
+  } catch (e) {
+    console.log(e)
+    res.status(400);
+    res.end(e.message);
+  } finally {
+    // Remove whichever temp files exist; a failed unlink is logged so it cannot
+    // escape the async handler as an unhandled rejection.
+    for (const file of [libretto, transcription]) {
+      if (!file) continue;
+      try {
+        fs.unlinkSync(file.path);
+      } catch (e) {
+        console.log(e);
+      }
+    }
+  }
+});
 
 app.listen(7003, function () {
   console.log('express started at: 7003')