diff --git a/removeads.py b/removeads.py index cab5920..1acb168 100755 --- a/removeads.py +++ b/removeads.py @@ -18,7 +18,8 @@ from select import select from math import floor, ceil, log from shutil import copyfile, which import hexdump - +from iso639 import Lang +from iso639.exceptions import InvalidLanguageValue # Useful SPS/PPS discussion # TODO: improve situation of SPS and PPS header mismatch when merging MVK with mkvmerge to remove warnings. @@ -106,13 +107,13 @@ def getSubTitlesTracks(ffprobe, mkvPath): return tracks -def extractSRT(mkvextract, mkvPath, destPath, tracks, langs): +def extractSRT(mkvextract, mkvPath, subtitles, langs): logger = logging.getLogger(__name__) params = [mkvextract, mkvPath, 'tracks'] res = [] - for lang in tracks: + for lang in subtitles: iso = Lang(lang) if iso in langs: @@ -121,47 +122,80 @@ def extractSRT(mkvextract, mkvPath, destPath, tracks, langs): logger.warning("Language not supported by Tesseract: %s" % iso.name) ocrlang ='osd' - if len(tracks[lang]) == 1: - params.append('%d:%s/%s' % (tracks[lang][0], destPath ,lang)) - res.append(('%s/%s.idx' % (destPath, lang), lang, ocrlang)) + if len(subtitles[lang]) == 1: + params.append('%d:%s' % (subtitles[lang][0], lang)) + res.append(('%s.idx' % lang, '%s.sub' % lang, lang, ocrlang)) else: count = 1 - for track in tracks[lang]: - params.append('%d:%s/%s-%d' % (track, destPath, lang, count)) - res.append(('%s/%s-%d.idx' % (destPath, lang,count), lang, ocrlang)) + for track in subtitles[lang]: + params.append('%d:%s-%d' % (track, lang, count)) + res.append(('%s-%d.idx' % (lang,count), '%s-%d.sub' % (lang,count), lang, ocrlang)) count = count+1 - with Popen(params) as extract: + env = {**os.environ, 'LANG': 'C'} + with Popen(params, env=env) as extract: extract.wait() if extract.returncode != 0: - print("Erreur de mkvextract: %d" % extract.returncode) + logger.error('Mkvextract returns an error code: %d' % extract.returncode) + return None else: - print("Extracted") + logger.info('Subtitle tracks were succesfully extracted.') return res -def doOCR(vobsubocr, idxs): +def doOCR(vobsubocr, idxs, duration, temporaries, dumpMemFD=False): + logger = logging.getLogger(__name__) res = [] - for filename, lang, iso in idxs: - print(filename) - srtname = '%s.srt' % os.path.splitext(filename)[0] - print(srtname) - # Tesseract reconnaît la chaîne de caractères ... comme le texte 'su' - p = re.compile('^su\n$') + for idxName, subName, lang, iso in idxs: + srtname = '%s.srt' % os.path.splitext(idxName)[0] + # Tesseract seems to recognize the three dots ... as "su" + ldots = re.compile('^su\n$') + timestamps = re.compile('^[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} \-\-> (?P[0-9]{2}):(?P[0-9]{2}):(?P[0-9]{2}),[0-9]{3}$') - if not os.path.isfile(srtname): - with open(srtname, 'w+') as srt: - with Popen([vobsubocr, '--lang', iso, filename], stdout=PIPE) as ocr: - for line in ocr.stdout: - line = line.decode('utf8') - m = re.match(p,line) - if m != None: - srt.write('...') - else: - srt.write(line) - res.append((srtname, lang)) + srtfd = memfd_create(srtname, flags=0) + with Popen([vobsubocr, '--lang', iso, idxName], stdout=PIPE) as ocr: + pb = tqdm(TextIOWrapper(ocr.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='OCR') + for line in pb: + m = re.match(ldots,line) + if m != None: + write(srtfd, '...'.encode(encoding='UTF-8')) + else: + write(srtfd, line.encode(encoding='UTF-8')) + + m = re.match(timestamps, line) + if m!=None: + hours = int(m.group('hours')) + minutes = int(m.group('hours')) + seconds = int(m.group('seconds')) + ts = timedelta(hours=hours, minutes=minutes, seconds=seconds) + pb.n = int(ts/timedelta(seconds=1)) + pb.update() + + status = ocr.wait() + + if status != 0: + logger.error('OCR failed with status code: %d' % status) + + if dumpMemFD: + try: + dumpSrt = open(srtname,'w') + except IOError: + logger.error('Impossible to create file: %s' % srtname) + return None + + lseek(srtfd, 0, SEEK_SET) + srtLength = fstat(srtfd).st_size + buf = read(srtfd, srtLength) + outfd = dumpSrt.fileno() + pos = 0 + while pos < srtLength: + pos+=write(outfd, buf[pos:]) + + temporaries.append(dumpSrt) + + res.append((srtfd, lang)) return res @@ -498,6 +532,44 @@ def getFormat(ffprobe, inputFile): return None + +def getMovieDuration(ffprobe, inputFile): + logger = logging.getLogger(__name__) + + infd = inputFile.fileno() + lseek(infd, 0, SEEK_SET) + set_inheritable(infd, True) + with Popen([ffprobe, '-loglevel', 'quiet', '-show_format', '-of', 'json', '-i', '/proc/self/fd/%d' % infd], stdout=PIPE, close_fds=False) as ffprobe: + out, _ = ffprobe.communicate() + out = json.load(BytesIO(out)) + if 'format' in out and 'duration' in out['format']: + duration = floor(float(out['format']['duration'])) + ts = timedelta(seconds=duration) + return ts + else: + logger.error('Impossible to retrieve duration of movie') + + return None + +# ffprobe -loglevel quiet -select_streams v:0 -show_entries stream=width,height -of json ./talons.ts +def getVideoDimensions(ffprobe, inputFile): + logger = logging.getLogger(__name__) + + infd = inputFile.fileno() + lseek(infd, 0, SEEK_SET) + set_inheritable(infd, True) + with Popen([ffprobe, '-loglevel', 'quiet', '-select_streams', 'v:0', '-show_entries', 'stream=width,height', '-of', 'json', '-i', '/proc/self/fd/%d' % infd], stdout=PIPE, close_fds=False) as ffprobe: + out, _ = ffprobe.communicate() + out = json.load(BytesIO(out)) + if 'streams' in out: + video = out['streams'][0] + if ('width' in video) and ('height' in video): + return int(video['width']), int(video['height']) + + logger.error('Impossible to retrieve dimensions of video') + exit(-1) + + def getStreams(ffprobe, inputFile): logger = logging.getLogger(__name__) @@ -637,17 +709,16 @@ def compareTimeInterval(interval1, interval2): else: return 0 - - -def ffmpegConvert(ffmpeg, inputFile, inputFormat, outputFile, outputFormat, duration): +def ffmpegConvert(ffmpeg, ffprobe, inputFile, inputFormat, outputFile, outputFormat, duration): logger = logging.getLogger(__name__) + width, height = getVideoDimensions(ffprobe, inputFile) + infd = inputFile.fileno() outfd = outputFile.fileno() set_inheritable(infd, True) set_inheritable(outfd, True) - # TODO: canvas size to be fixed ! - with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '720x560', '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd, + with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '%dx%d' % (width, height), '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd, '-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub', '-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg: pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion') @@ -1119,18 +1190,23 @@ def mergeMKVs(inputs, outputName): return out -def findSubtitlesTracks(ffprobe, filename): - # ffprobe -loglevel quiet -select_streams s -show_entries stream=index:stream_tags=language -of json corgi.ts +def findSubtitlesTracks(ffprobe, inputFile): logger = logging.getLogger(__name__) - with Popen([ffprobe, '-i', filename, '-select_streams', 's', '-show_entries', 'stream=index:stream_tags=language', '-of', 'json'], stdout=PIPE, close_fds=False) as ffprobe: + infd = inputFile.fileno() + lseek(infd, 0, SEEK_SET) + set_inheritable(infd, True) + + with Popen([ffprobe, '-loglevel','quiet', '-i', '/proc/self/fd/%d' % infd, '-select_streams', 's', '-show_entries', 'stream=index:stream_tags=language', '-of', 'json'], stdout=PIPE, close_fds=False) as ffprobe: out, _ = ffprobe.communicate() out = json.load(BytesIO(out)) if 'streams' in out: return out['streams'] else: logger.error('Impossible to retrieve format of file') - pass + + ffprobe.wait() + def extractSubTitleTrack(mkvmerge, inputFileName, index, lang): # mkvextract video.mkv tracks position:nom [position:nom] @@ -1228,16 +1304,16 @@ def main(): if formatOfFile == SupportedFormat.TS: logger.info("Converting TS to MP4 (to fix timestamps).") try: - with open(mp4filename, 'w') as mp4: - ffmpegConvert(paths['ffmpeg'], inputFile, 'mpegts', mp4, 'mp4', duration) + with open(mp4filename, 'w+') as mp4: + ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], inputFile, 'mpegts', mp4, 'mp4', duration) temporaries.append(mp4) logger.info("Converting MP4 to MKV.") try: - mkv = open(mkvfilename, 'w') + mkv = open(mkvfilename, 'w+') except IOError: logger.error('') - ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration) + ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration) if nbParts > 0: temporaries.append(mkv) except IOError: @@ -1249,7 +1325,7 @@ def main(): mkv = open(mkvfilename, 'w') except IOError: logger.error('') - ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration) + ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration) if nbParts > 0: temporaries.append(mkv) else: @@ -1368,8 +1444,18 @@ def main(): if not allOptionalTools: logger.warning("Missing tools for extracting subtitles.") else: - logger.info("Find subtitles tracks and language.") - subtitles = findSubtitlesTracks(args.outputFile) + try: + final = open(args.outputFile, mode='r') + except IOError: + logger.error("Impossible to open %s to finalize processing." % args.outputFile) + exit(-1) + + duration = getMovieDuration(paths['ffprobe'], final) + supportedLangs = getTesseractSupportedLang(paths['tesseract']) + logger.info('Supported lang: %s' % supportedLangs) + logger.info('Find subtitles tracks and language.') + subtitles = findSubtitlesTracks(paths['ffprobe'], final) + logger.info(subtitles) sts = {} for subtitle in subtitles: index = subtitle['index'] @@ -1384,20 +1470,28 @@ def main(): logger.error("Dropping subtitle: %s because it is missing language indication") else: logger.error("Dropping subtitle: %s because it is missing language indication") + + logger.debug(sts) + listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs) + logger.info(listOfSubtitles) + for idxName, subName, _, _ in listOfSubtitles: + try: + idx = open(idxName,'r') + except IOError: + logger.error("Impossible to open %s." % idxName) + exit(-1) + try: + sub = open(subName,'r') + except IOError: + logger.error("Impossible to open %s." % subName) + exit(-1) + + temporaries.append(idx) + temporaries.append(sub) - for lang in sts: - indexes = sts[lang] - if len(indexes) == 0: - # Nothing to do. This should not happen. - continue - if len(indexes) == 1: - index = indexes[0] - filename = 'essai-%s.srt' % lang - elif len(indexes) > 1: - nbsrt = 1 - for index in indexes: - filename = 'essai-%s-%d.srt' % (lang, nbsrt) - nbsrt+=1 + ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump) + logger.info(ocr) + if not args.keep: logger.info("Cleaning temporary files")