Subtitles extracted through OCR can be remuxed with the final cut movie (in addition to image based ones).

This commit is contained in:
Frédéric Tronel
2023-12-24 14:29:42 +01:00
parent ffce9aecdf
commit 889b8dd6dc

View File

@@ -16,7 +16,7 @@ import shutil
from tqdm import tqdm, trange
from select import select
from math import floor, ceil, log
from shutil import copyfile, which
from shutil import copyfile, which, move
import hexdump
from iso639 import Lang
from iso639.exceptions import InvalidLanguageValue
@@ -107,9 +107,10 @@ def getSubTitlesTracks(ffprobe, mkvPath):
return tracks
def extractSRT(mkvextract, mkvPath, subtitles, langs):
def extractSRT(mkvextract, fileName, subtitles, langs):
logger = logging.getLogger(__name__)
params = [mkvextract, mkvPath, 'tracks']
params = [mkvextract, fileName, 'tracks']
res = []
@@ -133,8 +134,20 @@ def extractSRT(mkvextract, mkvPath, subtitles, langs):
count = count+1
env = {**os.environ, 'LANG': 'C'}
with Popen(params, env=env) as extract:
extract.wait()
with Popen(params, stdout=PIPE, close_fds=False, env=env) as extract:
pb = tqdm(TextIOWrapper(extract.stdout, encoding="utf-8"), total=100, unit='%', desc='Extraction:')
for line in pb:
if line.startswith('Progress :'):
p = re.compile('^Progress : (?P<progress>[0-9]{1,3})%$')
m = p.match(line)
if m == None:
logger.error('Impossible to parse progress')
pb.update(int(m['progress'])-pb.n)
pb.update(100-pb.n)
pb.refresh()
pb.close()
extract.wait()
if extract.returncode != 0:
logger.error('Mkvextract returns an error code: %d' % extract.returncode)
@@ -981,6 +994,8 @@ def dumpPPM(pictures, prefix, temporaries):
nbBytes+=write(outfd, pictures[pos+nbBytes:pos+length])
pos+=length
picture+=1
def extractAllStreams(ffmpeg, ffprobe, inputFile, begin, end, streams, filesPrefix, nbFrames, width, height, temporaries, dumpMemFD=False):
logger = logging.getLogger(__name__)
@@ -988,6 +1003,7 @@ def extractAllStreams(ffmpeg, ffprobe, inputFile, begin, end, streams, filesPref
inputParams = []
codecsParams = []
if begin < end:
videoID=0
audioID=0
@@ -1001,6 +1017,8 @@ def extractAllStreams(ffmpeg, ffprobe, inputFile, begin, end, streams, filesPref
m = pattern.match(frameRate)
if m != None:
frameRate = float(m['numerator']) / float(m['denominator'])
# TODO: Framerate estimation seems broken ...
frameRate = 25.
sar = stream['sample_aspect_ratio']
dar = stream['display_aspect_ratio']
pixelFormat = stream['pix_fmt']
@@ -1217,6 +1235,52 @@ def extractSubTitleTrack(mkvmerge, inputFileName, index, lang):
for lines in out:
logger.info(out)
def remuxSRTSubtitles(mkvmerge, inputFile, outputFileName, subtitles):
    """Remux text subtitles into a movie by running mkvmerge.

    The input movie, every subtitle and the output file are handed to
    mkvmerge as paths under /proc/self/fd, so the corresponding descriptors
    are rewound and made inheritable before the child process is started.

    Args:
        mkvmerge: Path of the mkvmerge executable.
        inputFile: Open file object of the movie; only its fileno() is used.
        outputFileName: Name of the file to create for the remuxed result.
        subtitles: Iterable of (fd, lang) pairs, where fd is an open file
            descriptor of a subtitle and lang is the value passed to
            mkvmerge's --language option for that subtitle.

    Returns:
        None in all cases. Failures are only reported through the logger.
    """
    logger = logging.getLogger(__name__)

    try:
        out = open(outputFileName, 'w')
    except IOError:
        logger.error('Impossible to create file: %s' % outputFileName)
        return None

    # Compiled once; the same pattern is applied to every progress line.
    progressPattern = re.compile('^Progress : (?P<progress>[0-9]{1,3})%$')

    # 'with out' guarantees the output file object is closed once mkvmerge
    # is done with it (it used to stay open until garbage collection).
    with out:
        outfd = out.fileno()
        infd = inputFile.fileno()
        lseek(infd, 0, SEEK_SET)
        set_inheritable(infd, True)
        set_inheritable(outfd, True)

        mkvmergeParams = [mkvmerge, '/proc/self/fd/%d' % infd]
        for fd, lang in subtitles:
            lseek(fd, 0, SEEK_SET)
            set_inheritable(fd, True)
            mkvmergeParams.extend(['--language', '0:%s' % lang, '/proc/self/fd/%d' % fd])
        mkvmergeParams.extend(['-o', '/proc/self/fd/%d' % outfd])

        warnings = []
        # Force the C locale so that the textual output can be parsed.
        env = {**os.environ, 'LANG': 'C'}
        logger.info('Remux subtitles: %s' % mkvmergeParams)

        # close_fds=False: the child must inherit the descriptors referenced
        # through /proc/self/fd. The process is bound to 'proc' so that the
        # 'mkvmerge' parameter is not shadowed.
        with Popen(mkvmergeParams, stdout=PIPE, close_fds=False, env=env) as proc:
            output = TextIOWrapper(proc.stdout, encoding="utf-8")
            # The bar is driven manually from the reported percentage instead
            # of wrapping the line iterator, which counted one tick per
            # output line on top of the parsed value.
            with tqdm(total=100, unit='%', desc='Remux subtitles:') as pb:
                for line in output:
                    if line.startswith('Progress :'):
                        m = progressPattern.match(line)
                        if m is None:
                            # Skip the malformed line instead of crashing on
                            # a None match.
                            logger.error('Impossible to parse progress')
                            continue
                        pb.n = int(m['progress'])
                        pb.refresh()
                    elif line.startswith('Warning'):
                        warnings.append(line)
            status = proc.wait()

    if status == 1:
        logger.warning('Extraction returns warning')
        for w in warnings:
            logger.warning(w)
    elif status == 2:
        logger.error('Extraction returns errors')
def main():
@@ -1431,30 +1495,35 @@ def main():
# We need to check the end also
checks.append(pos)
finalCutName = '%s-cut.mkv' % basename
nbMKVParts = len(mkvparts)
if nbMKVParts > 1:
logger.info('Merging: %s' % mkvparts)
mergeMKVs(inputs=mkvparts, outputName=args.outputFile)
mergeMKVs(inputs=mkvparts, outputName=finalCutName)
elif nbMKVParts == 1:
copyfile('part-1.mkv', args.outputFile)
copyfile('part-1.mkv', finalCutName)
else:
logger.info("Nothing else to do.")
try:
finalCut = open(finalCutName, mode='r')
except IOError:
logger.error("Impossible to open %s to finalize processing." % finalCutName)
exit(-1)
if args.srt:
if not allOptionalTools:
logger.warning("Missing tools for extracting subtitles.")
move(finalCutName, args.outputFile)
else:
try:
final = open(args.outputFile, mode='r')
except IOError:
logger.error("Impossible to open %s to finalize processing." % args.outputFile)
exit(-1)
duration = getMovieDuration(paths['ffprobe'], final)
# Final cut is no longer the final step.
temporaries.append(finalCut)
duration = getMovieDuration(paths['ffprobe'], finalCut)
supportedLangs = getTesseractSupportedLang(paths['tesseract'])
logger.info('Supported lang: %s' % supportedLangs)
logger.info('Find subtitles tracks and language.')
subtitles = findSubtitlesTracks(paths['ffprobe'], final)
subtitles = findSubtitlesTracks(paths['ffprobe'], finalCut)
logger.info(subtitles)
sts = {}
for subtitle in subtitles:
@@ -1472,7 +1541,7 @@ def main():
logger.error("Dropping subtitle: %s because it is missing language indication")
logger.debug(sts)
listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs)
listOfSubtitles = extractSRT(paths['mkvextract'], finalCutName, sts, supportedLangs)
logger.info(listOfSubtitles)
for idxName, subName, _, _ in listOfSubtitles:
try:
@@ -1491,7 +1560,11 @@ def main():
ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump)
logger.info(ocr)
# Remux SRT subtitles
remuxSRTSubtitles(paths['mkvmerge'], finalCut, args.outputFile, ocr)
else:
move(finalCutName, args.outputFile)
if not args.keep:
logger.info("Cleaning temporary files")