Compare commits
	
		
			2 Commits
		
	
	
		
			03922a76d2
			...
			ffce9aecdf
		
	
| Author | SHA1 | Date | |
|---|---|---|---|
|  | ffce9aecdf | ||
|  | 4dbf9d9c03 | 
							
								
								
									
										2
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								Makefile
									
									
									
									
									
								
							| @@ -1,2 +1,2 @@ | |||||||
| clean: | clean: | ||||||
| 	rm -f *.ppm *.pcm part* | 	rm -f *.ppm *.pcm part* *.srt | ||||||
|   | |||||||
							
								
								
									
										208
									
								
								removeads.py
									
									
									
									
									
								
							
							
						
						
									
										208
									
								
								removeads.py
									
									
									
									
									
								
							| @@ -18,7 +18,8 @@ from select import select | |||||||
| from math import floor, ceil, log | from math import floor, ceil, log | ||||||
| from shutil import copyfile, which | from shutil import copyfile, which | ||||||
| import hexdump | import hexdump | ||||||
|  | from iso639 import Lang | ||||||
|  | from iso639.exceptions import InvalidLanguageValue | ||||||
|  |  | ||||||
| # Useful SPS/PPS discussion | # Useful SPS/PPS discussion | ||||||
| # TODO: improve situation of SPS and PPS header mismatch when merging MVK with mkvmerge to remove warnings. | # TODO: improve situation of SPS and PPS header mismatch when merging MVK with mkvmerge to remove warnings. | ||||||
| @@ -106,13 +107,13 @@ def getSubTitlesTracks(ffprobe, mkvPath): | |||||||
|      |      | ||||||
|     return tracks |     return tracks | ||||||
|  |  | ||||||
| def extractSRT(mkvextract, mkvPath, destPath, tracks, langs): | def extractSRT(mkvextract, mkvPath, subtitles, langs): | ||||||
|     logger = logging.getLogger(__name__) |     logger = logging.getLogger(__name__) | ||||||
|     params = [mkvextract, mkvPath, 'tracks'] |     params = [mkvextract, mkvPath, 'tracks'] | ||||||
|      |      | ||||||
|     res = [] |     res = [] | ||||||
|  |  | ||||||
|     for lang in tracks: |     for lang in subtitles: | ||||||
|         iso = Lang(lang) |         iso = Lang(lang) | ||||||
|          |          | ||||||
|         if iso in langs: |         if iso in langs: | ||||||
| @@ -121,47 +122,80 @@ def extractSRT(mkvextract, mkvPath, destPath, tracks, langs): | |||||||
|             logger.warning("Language not supported by Tesseract: %s" % iso.name) |             logger.warning("Language not supported by Tesseract: %s" % iso.name) | ||||||
|             ocrlang ='osd' |             ocrlang ='osd' | ||||||
|              |              | ||||||
|         if len(tracks[lang]) == 1: |         if len(subtitles[lang]) == 1: | ||||||
|                 params.append('%d:%s/%s' % (tracks[lang][0], destPath ,lang)) |                 params.append('%d:%s' % (subtitles[lang][0], lang)) | ||||||
|                 res.append(('%s/%s.idx' % (destPath, lang), lang, ocrlang)) |                 res.append(('%s.idx' % lang, '%s.sub' % lang, lang, ocrlang)) | ||||||
|         else: |         else: | ||||||
|             count = 1 |             count = 1 | ||||||
|             for track in tracks[lang]: |             for track in subtitles[lang]: | ||||||
|                 params.append('%d:%s/%s-%d' % (track, destPath, lang, count)) |                 params.append('%d:%s-%d' % (track, lang, count)) | ||||||
|                 res.append(('%s/%s-%d.idx' % (destPath, lang,count), lang, ocrlang)) |                 res.append(('%s-%d.idx' % (lang,count), '%s-%d.sub' % (lang,count), lang, ocrlang)) | ||||||
|                 count = count+1 |                 count = count+1 | ||||||
|  |  | ||||||
|     with Popen(params) as extract: |     env = {**os.environ, 'LANG': 'C'} | ||||||
|  |     with Popen(params, env=env) as extract: | ||||||
|         extract.wait() |         extract.wait() | ||||||
|          |          | ||||||
|     if extract.returncode != 0: |     if extract.returncode != 0: | ||||||
|         print("Erreur de mkvextract: %d" % extract.returncode) |         logger.error('Mkvextract returns an error code: %d' % extract.returncode) | ||||||
|  |         return None | ||||||
|     else: |     else: | ||||||
|         print("Extracted") |         logger.info('Subtitle tracks were succesfully extracted.') | ||||||
|  |  | ||||||
|     return res |     return res | ||||||
|  |  | ||||||
| def doOCR(vobsubocr, idxs): | def doOCR(vobsubocr, idxs, duration, temporaries, dumpMemFD=False): | ||||||
|  |     logger = logging.getLogger(__name__) | ||||||
|     res = [] |     res = [] | ||||||
|      |      | ||||||
|     for filename, lang, iso in idxs: |     for idxName, subName, lang, iso in idxs: | ||||||
|         print(filename) |         srtname =  '%s.srt' % os.path.splitext(idxName)[0] | ||||||
|         srtname =  '%s.srt' % os.path.splitext(filename)[0] |         # Tesseract seems to recognize the three dots ... as "su" | ||||||
|         print(srtname) |         ldots = re.compile('^su\n$') | ||||||
|         # Tesseract reconnaît la chaîne de caractères ... comme le texte 'su' |         timestamps = re.compile('^[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} \-\-> (?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}),[0-9]{3}$') | ||||||
|         p = re.compile('^su\n$') |  | ||||||
|  |  | ||||||
|         if not os.path.isfile(srtname): |         srtfd = memfd_create(srtname, flags=0) | ||||||
|             with open(srtname, 'w+') as srt: |         with Popen([vobsubocr, '--lang', iso, idxName], stdout=PIPE) as ocr: | ||||||
|                 with Popen([vobsubocr, '--lang', iso, filename], stdout=PIPE) as ocr: |             pb = tqdm(TextIOWrapper(ocr.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='OCR') | ||||||
|                     for line in ocr.stdout: |             for line in pb: | ||||||
|                         line = line.decode('utf8') |                 m = re.match(ldots,line) | ||||||
|                         m = re.match(p,line) |  | ||||||
|                 if m != None: |                 if m != None: | ||||||
|                             srt.write('...') |                     write(srtfd, '...'.encode(encoding='UTF-8')) | ||||||
|                 else: |                 else: | ||||||
|                             srt.write(line) |                     write(srtfd, line.encode(encoding='UTF-8')) | ||||||
|         res.append((srtname, lang)) |                      | ||||||
|  |                 m = re.match(timestamps, line) | ||||||
|  |                 if m!=None: | ||||||
|  |                     hours = int(m.group('hours')) | ||||||
|  |                     minutes = int(m.group('hours')) | ||||||
|  |                     seconds = int(m.group('seconds')) | ||||||
|  |                     ts = timedelta(hours=hours, minutes=minutes, seconds=seconds) | ||||||
|  |                     pb.n = int(ts/timedelta(seconds=1)) | ||||||
|  |                     pb.update() | ||||||
|  |  | ||||||
|  |         status = ocr.wait() | ||||||
|  |      | ||||||
|  |         if status != 0: | ||||||
|  |             logger.error('OCR failed with status code: %d' % status) | ||||||
|  |              | ||||||
|  |         if dumpMemFD: | ||||||
|  |             try: | ||||||
|  |                 dumpSrt = open(srtname,'w') | ||||||
|  |             except IOError: | ||||||
|  |                 logger.error('Impossible to create file: %s' % srtname) | ||||||
|  |                 return None | ||||||
|  |  | ||||||
|  |             lseek(srtfd, 0, SEEK_SET) | ||||||
|  |             srtLength = fstat(srtfd).st_size  | ||||||
|  |             buf = read(srtfd, srtLength) | ||||||
|  |             outfd = dumpSrt.fileno() | ||||||
|  |             pos = 0 | ||||||
|  |             while pos < srtLength: | ||||||
|  |                 pos+=write(outfd, buf[pos:]) | ||||||
|  |  | ||||||
|  |             temporaries.append(dumpSrt) | ||||||
|  |  | ||||||
|  |         res.append((srtfd, lang)) | ||||||
|  |  | ||||||
|     return res |     return res | ||||||
|  |  | ||||||
| @@ -498,6 +532,44 @@ def getFormat(ffprobe, inputFile): | |||||||
|  |  | ||||||
|     return None |     return None | ||||||
|  |  | ||||||
|  |  | ||||||
|  | def getMovieDuration(ffprobe, inputFile): | ||||||
|  |     logger = logging.getLogger(__name__) | ||||||
|  |          | ||||||
|  |     infd = inputFile.fileno() | ||||||
|  |     lseek(infd, 0, SEEK_SET) | ||||||
|  |     set_inheritable(infd, True) | ||||||
|  |     with Popen([ffprobe, '-loglevel', 'quiet', '-show_format', '-of', 'json', '-i', '/proc/self/fd/%d' % infd], stdout=PIPE, close_fds=False) as ffprobe: | ||||||
|  |         out, _ = ffprobe.communicate() | ||||||
|  |         out = json.load(BytesIO(out)) | ||||||
|  |         if 'format' in out and 'duration' in out['format']: | ||||||
|  |             duration = floor(float(out['format']['duration'])) | ||||||
|  |             ts = timedelta(seconds=duration) | ||||||
|  |             return ts | ||||||
|  |         else: | ||||||
|  |             logger.error('Impossible to retrieve duration of movie') | ||||||
|  |  | ||||||
|  |     return None | ||||||
|  |  | ||||||
|  | # ffprobe -loglevel quiet -select_streams v:0 -show_entries stream=width,height -of json ./talons.ts | ||||||
|  | def getVideoDimensions(ffprobe, inputFile): | ||||||
|  |     logger = logging.getLogger(__name__) | ||||||
|  |          | ||||||
|  |     infd = inputFile.fileno() | ||||||
|  |     lseek(infd, 0, SEEK_SET) | ||||||
|  |     set_inheritable(infd, True) | ||||||
|  |     with Popen([ffprobe, '-loglevel', 'quiet', '-select_streams', 'v:0',  '-show_entries', 'stream=width,height', '-of', 'json', '-i', '/proc/self/fd/%d' % infd], stdout=PIPE, close_fds=False) as ffprobe: | ||||||
|  |         out, _ = ffprobe.communicate() | ||||||
|  |         out = json.load(BytesIO(out)) | ||||||
|  |         if 'streams' in out: | ||||||
|  |             video = out['streams'][0] | ||||||
|  |             if ('width' in video) and ('height' in video): | ||||||
|  |                 return int(video['width']), int(video['height']) | ||||||
|  |      | ||||||
|  |     logger.error('Impossible to retrieve dimensions of video') | ||||||
|  |     exit(-1) | ||||||
|  |  | ||||||
|  |  | ||||||
| def getStreams(ffprobe, inputFile): | def getStreams(ffprobe, inputFile): | ||||||
|     logger = logging.getLogger(__name__) |     logger = logging.getLogger(__name__) | ||||||
|          |          | ||||||
| @@ -637,17 +709,16 @@ def compareTimeInterval(interval1, interval2): | |||||||
|     else: |     else: | ||||||
|         return 0 |         return 0 | ||||||
|  |  | ||||||
|  | def ffmpegConvert(ffmpeg, ffprobe, inputFile, inputFormat, outputFile, outputFormat, duration): | ||||||
|  |  | ||||||
| def ffmpegConvert(ffmpeg, inputFile, inputFormat, outputFile, outputFormat, duration): |  | ||||||
|     logger = logging.getLogger(__name__) |     logger = logging.getLogger(__name__) | ||||||
|      |      | ||||||
|  |     width, height =  getVideoDimensions(ffprobe, inputFile) | ||||||
|  |      | ||||||
|     infd = inputFile.fileno() |     infd = inputFile.fileno() | ||||||
|     outfd = outputFile.fileno() |     outfd = outputFile.fileno() | ||||||
|     set_inheritable(infd, True) |     set_inheritable(infd, True) | ||||||
|     set_inheritable(outfd, True) |     set_inheritable(outfd, True) | ||||||
|     # TODO: canvas size to be fixed ! |     with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '%dx%d' % (width, height),  '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,  | ||||||
|     with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '720x560', '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,  |  | ||||||
|                 '-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',  |                 '-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',  | ||||||
|                 '-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg: |                 '-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg: | ||||||
|         pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion') |         pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion') | ||||||
| @@ -1119,18 +1190,23 @@ def mergeMKVs(inputs, outputName): | |||||||
|      |      | ||||||
|     return out |     return out | ||||||
|  |  | ||||||
| def findSubtitlesTracks(ffprobe, filename): | def findSubtitlesTracks(ffprobe, inputFile): | ||||||
|     # ffprobe -loglevel quiet -select_streams s -show_entries stream=index:stream_tags=language -of json corgi.ts |  | ||||||
|     logger = logging.getLogger(__name__) |     logger = logging.getLogger(__name__) | ||||||
|      |      | ||||||
|     with Popen([ffprobe, '-i', filename, '-select_streams', 's', '-show_entries', 'stream=index:stream_tags=language', '-of', 'json'], stdout=PIPE, close_fds=False) as ffprobe: |     infd = inputFile.fileno() | ||||||
|  |     lseek(infd, 0, SEEK_SET) | ||||||
|  |     set_inheritable(infd, True) | ||||||
|  |      | ||||||
|  |     with Popen([ffprobe, '-loglevel','quiet', '-i', '/proc/self/fd/%d' % infd, '-select_streams', 's', '-show_entries', 'stream=index:stream_tags=language', '-of', 'json'], stdout=PIPE, close_fds=False) as ffprobe: | ||||||
|         out, _ = ffprobe.communicate() |         out, _ = ffprobe.communicate() | ||||||
|         out = json.load(BytesIO(out)) |         out = json.load(BytesIO(out)) | ||||||
|         if 'streams' in out: |         if 'streams' in out: | ||||||
|             return out['streams'] |             return out['streams'] | ||||||
|         else: |         else: | ||||||
|             logger.error('Impossible to retrieve format of file') |             logger.error('Impossible to retrieve format of file') | ||||||
|     pass |      | ||||||
|  |     ffprobe.wait() | ||||||
|  |      | ||||||
|  |  | ||||||
| def extractSubTitleTrack(mkvmerge, inputFileName, index, lang): | def extractSubTitleTrack(mkvmerge, inputFileName, index, lang): | ||||||
|     #  mkvextract video.mkv tracks position:nom [position:nom] |     #  mkvextract video.mkv tracks position:nom [position:nom] | ||||||
| @@ -1228,16 +1304,16 @@ def main(): | |||||||
|     if formatOfFile == SupportedFormat.TS: |     if formatOfFile == SupportedFormat.TS: | ||||||
|         logger.info("Converting TS to MP4 (to fix timestamps).") |         logger.info("Converting TS to MP4 (to fix timestamps).") | ||||||
|         try: |         try: | ||||||
|             with open(mp4filename, 'w') as mp4:  |             with open(mp4filename, 'w+') as mp4:  | ||||||
|                 ffmpegConvert(paths['ffmpeg'], inputFile, 'mpegts', mp4, 'mp4', duration) |                 ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], inputFile, 'mpegts', mp4, 'mp4', duration) | ||||||
|                 temporaries.append(mp4) |                 temporaries.append(mp4) | ||||||
|                 logger.info("Converting MP4 to MKV.") |                 logger.info("Converting MP4 to MKV.") | ||||||
|                 try: |                 try: | ||||||
|                     mkv = open(mkvfilename, 'w') |                     mkv = open(mkvfilename, 'w+') | ||||||
|                 except IOError: |                 except IOError: | ||||||
|                     logger.error('') |                     logger.error('') | ||||||
|                      |                      | ||||||
|                 ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration) |                 ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration) | ||||||
|                 if nbParts > 0: |                 if nbParts > 0: | ||||||
|                     temporaries.append(mkv) |                     temporaries.append(mkv) | ||||||
|         except IOError: |         except IOError: | ||||||
| @@ -1249,7 +1325,7 @@ def main(): | |||||||
|             mkv = open(mkvfilename, 'w') |             mkv = open(mkvfilename, 'w') | ||||||
|         except IOError: |         except IOError: | ||||||
|             logger.error('') |             logger.error('') | ||||||
|         ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration) |         ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration) | ||||||
|         if nbParts > 0: |         if nbParts > 0: | ||||||
|             temporaries.append(mkv) |             temporaries.append(mkv) | ||||||
|     else: |     else: | ||||||
| @@ -1368,8 +1444,18 @@ def main(): | |||||||
|         if not allOptionalTools: |         if not allOptionalTools: | ||||||
|             logger.warning("Missing tools for extracting subtitles.") |             logger.warning("Missing tools for extracting subtitles.") | ||||||
|         else: |         else: | ||||||
|             logger.info("Find subtitles tracks and language.") |             try: | ||||||
|             subtitles = findSubtitlesTracks(args.outputFile) |                 final = open(args.outputFile, mode='r') | ||||||
|  |             except IOError: | ||||||
|  |                 logger.error("Impossible to open %s to finalize processing." % args.outputFile) | ||||||
|  |                 exit(-1) | ||||||
|  |              | ||||||
|  |             duration = getMovieDuration(paths['ffprobe'], final) | ||||||
|  |             supportedLangs = getTesseractSupportedLang(paths['tesseract']) | ||||||
|  |             logger.info('Supported lang: %s' % supportedLangs) | ||||||
|  |             logger.info('Find subtitles tracks and language.') | ||||||
|  |             subtitles = findSubtitlesTracks(paths['ffprobe'], final) | ||||||
|  |             logger.info(subtitles) | ||||||
|             sts = {} |             sts = {} | ||||||
|             for subtitle in subtitles: |             for subtitle in subtitles: | ||||||
|                 index = subtitle['index'] |                 index = subtitle['index'] | ||||||
| @@ -1385,19 +1471,27 @@ def main(): | |||||||
|                 else: |                 else: | ||||||
|                     logger.error("Dropping subtitle: %s because it is missing language indication") |                     logger.error("Dropping subtitle: %s because it is missing language indication") | ||||||
|              |              | ||||||
|             for lang in sts: |             logger.debug(sts) | ||||||
|                 indexes = sts[lang] |             listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs) | ||||||
|                 if len(indexes) == 0: |             logger.info(listOfSubtitles) | ||||||
|                     # Nothing to do. This should not happen. |             for idxName, subName, _, _ in listOfSubtitles: | ||||||
|                     continue |                 try: | ||||||
|                 if len(indexes) == 1: |                     idx = open(idxName,'r') | ||||||
|                     index = indexes[0] |                 except IOError: | ||||||
|                     filename = 'essai-%s.srt' % lang |                     logger.error("Impossible to open %s." % idxName) | ||||||
|                 elif len(indexes) > 1: |                     exit(-1) | ||||||
|                     nbsrt = 1 |                 try: | ||||||
|                     for index in indexes: |                     sub = open(subName,'r') | ||||||
|                         filename = 'essai-%s-%d.srt' % (lang, nbsrt) |                 except IOError: | ||||||
|                         nbsrt+=1 |                     logger.error("Impossible to open %s." % subName) | ||||||
|  |                     exit(-1) | ||||||
|  |                  | ||||||
|  |                 temporaries.append(idx) | ||||||
|  |                 temporaries.append(sub) | ||||||
|  |          | ||||||
|  |             ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump) | ||||||
|  |             logger.info(ocr) | ||||||
|  |              | ||||||
|    |    | ||||||
|     if not args.keep: |     if not args.keep: | ||||||
|         logger.info("Cleaning temporary files") |         logger.info("Cleaning temporary files") | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user