Compare commits

..

2 Commits

Author SHA1 Message Date
Frédéric Tronel
ffce9aecdf Handling of OCR to generate subtitles files is working. 2023-12-22 14:57:25 +01:00
Frédéric Tronel
4dbf9d9c03 Suppress SRT files for cleaning. 2023-12-22 14:56:05 +01:00
2 changed files with 154 additions and 60 deletions

View File

@@ -1,2 +1,2 @@
# Remove the intermediate files matched by the patterns below
# (*.ppm, *.pcm, part*, *.srt).
# Declared phony so that a file named "clean" can never mask the target.
.PHONY: clean
clean:
	rm -f *.ppm *.pcm part* *.srt

View File

@@ -18,7 +18,8 @@ from select import select
from math import floor, ceil, log from math import floor, ceil, log
from shutil import copyfile, which from shutil import copyfile, which
import hexdump import hexdump
from iso639 import Lang
from iso639.exceptions import InvalidLanguageValue
# Useful SPS/PPS discussion # Useful SPS/PPS discussion
# TODO: improve situation of SPS and PPS header mismatch when merging MVK with mkvmerge to remove warnings. # TODO: improve situation of SPS and PPS header mismatch when merging MVK with mkvmerge to remove warnings.
@@ -106,13 +107,13 @@ def getSubTitlesTracks(ffprobe, mkvPath):
return tracks return tracks
def extractSRT(mkvextract, mkvPath, destPath, tracks, langs): def extractSRT(mkvextract, mkvPath, subtitles, langs):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
params = [mkvextract, mkvPath, 'tracks'] params = [mkvextract, mkvPath, 'tracks']
res = [] res = []
for lang in tracks: for lang in subtitles:
iso = Lang(lang) iso = Lang(lang)
if iso in langs: if iso in langs:
@@ -121,47 +122,80 @@ def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
logger.warning("Language not supported by Tesseract: %s" % iso.name) logger.warning("Language not supported by Tesseract: %s" % iso.name)
ocrlang ='osd' ocrlang ='osd'
if len(tracks[lang]) == 1: if len(subtitles[lang]) == 1:
params.append('%d:%s/%s' % (tracks[lang][0], destPath ,lang)) params.append('%d:%s' % (subtitles[lang][0], lang))
res.append(('%s/%s.idx' % (destPath, lang), lang, ocrlang)) res.append(('%s.idx' % lang, '%s.sub' % lang, lang, ocrlang))
else: else:
count = 1 count = 1
for track in tracks[lang]: for track in subtitles[lang]:
params.append('%d:%s/%s-%d' % (track, destPath, lang, count)) params.append('%d:%s-%d' % (track, lang, count))
res.append(('%s/%s-%d.idx' % (destPath, lang,count), lang, ocrlang)) res.append(('%s-%d.idx' % (lang,count), '%s-%d.sub' % (lang,count), lang, ocrlang))
count = count+1 count = count+1
with Popen(params) as extract: env = {**os.environ, 'LANG': 'C'}
with Popen(params, env=env) as extract:
extract.wait() extract.wait()
if extract.returncode != 0: if extract.returncode != 0:
print("Erreur de mkvextract: %d" % extract.returncode) logger.error('Mkvextract returns an error code: %d' % extract.returncode)
return None
else: else:
print("Extracted") logger.info('Subtitle tracks were succesfully extracted.')
return res return res
def doOCR(vobsubocr, idxs, duration, temporaries, dumpMemFD=False):
    """Run vobsubocr on each VobSub track and collect the SRT text in memory files.

    Args:
        vobsubocr: Path of the vobsubocr executable.
        idxs: Iterable of (idxName, subName, lang, ocrlang) tuples; only the
            .idx name, the language and the OCR language are used here.
        duration: Movie duration as a timedelta; it only sizes the progress bar.
        temporaries: List that receives the file object of every SRT file
            dumped to disk.
        dumpMemFD: When true, each in-memory SRT is also copied to
            '<idx base name>.srt' on disk.

    Returns:
        A list of (srtfd, lang) tuples where srtfd is the descriptor returned
        by memfd_create, or None when a dump file cannot be created.
    """
    logger = logging.getLogger(__name__)

    # Both patterns are loop invariants, so they are compiled once.
    # Tesseract seems to recognize the three dots ... as "su"
    ldots = re.compile('^su\n$')
    # Raw string: '\-' is not a valid escape in a regular string literal.
    timestamps = re.compile(r'^[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} \-\-> (?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}),[0-9]{3}$')
    oneSecond = timedelta(seconds=1)
    res = []

    for idxName, _subName, lang, iso in idxs:
        srtname = '%s.srt' % os.path.splitext(idxName)[0]
        srtfd = memfd_create(srtname, flags=0)

        with Popen([vobsubocr, '--lang', iso, idxName], stdout=PIPE) as ocr:
            pb = tqdm(TextIOWrapper(ocr.stdout, encoding="utf-8"), total=int(duration/oneSecond), unit='s', desc='OCR')
            for line in pb:
                if ldots.match(line) is not None:
                    # Keep the line terminator: without it the following blank
                    # separator line ends the text line instead of the entry.
                    write(srtfd, '...\n'.encode(encoding='UTF-8'))
                else:
                    write(srtfd, line.encode(encoding='UTF-8'))

                m = timestamps.match(line)
                if m is not None:
                    # Progress follows the end timestamp of the current entry.
                    ts = timedelta(hours=int(m.group('hours')),
                                   minutes=int(m.group('minutes')),
                                   seconds=int(m.group('seconds')))
                    pb.n = int(ts/oneSecond)
                    # refresh() redraws with the value just assigned, whereas
                    # update() would add one more unit to it.
                    pb.refresh()
            status = ocr.wait()

        if status != 0:
            logger.error('OCR failed with status code: %d' % status)

        if dumpMemFD:
            try:
                dumpSrt = open(srtname, 'w')
            except IOError:
                logger.error('Impossible to create file: %s' % srtname)
                return None

            lseek(srtfd, 0, SEEK_SET)
            buf = read(srtfd, fstat(srtfd).st_size)
            outfd = dumpSrt.fileno()
            pos = 0
            # Bounded by what was actually read, so that a short read cannot
            # turn the copy into an endless loop of zero-length writes.
            while pos < len(buf):
                pos += write(outfd, buf[pos:])
            temporaries.append(dumpSrt)

        res.append((srtfd, lang))

    return res
@@ -498,6 +532,44 @@ def getFormat(ffprobe, inputFile):
return None return None
def getMovieDuration(ffprobe, inputFile):
    """Return the duration of a movie, truncated to whole seconds.

    The already opened file is handed to ffprobe through /proc/self/fd, so
    the descriptor is rewound and made inheritable before the call.

    Args:
        ffprobe: Path of the ffprobe executable.
        inputFile: Open file object of the movie to probe.

    Returns:
        A timedelta, or None when ffprobe yields no usable duration (no
        output, output that is not JSON, missing or non-numeric field).
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-show_format', '-of', 'json', '-i', '/proc/self/fd/%d' % infd]
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        seconds = floor(float(json.loads(out)['format']['duration']))
    except (ValueError, KeyError, TypeError, OverflowError):
        # ValueError also covers json.JSONDecodeError (empty or non-JSON output).
        logger.error('Impossible to retrieve duration of movie')
        return None

    return timedelta(seconds=seconds)
# ffprobe -loglevel quiet -select_streams v:0 -show_entries stream=width,height -of json ./talons.ts
def getVideoDimensions(ffprobe, inputFile):
    """Return (width, height) of the first video stream of a movie.

    The already opened file is handed to ffprobe through /proc/self/fd, so
    the descriptor is rewound and made inheritable before the call.

    Args:
        ffprobe: Path of the ffprobe executable.
        inputFile: Open file object of the movie to probe.

    Returns:
        A (width, height) tuple of integers.

    Raises:
        SystemExit: With code -1, after logging an error, when the dimensions
            cannot be read (no output, output that is not JSON, no video
            stream, missing or non-numeric width/height).
    """
    import sys

    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-select_streams', 'v:0', '-show_entries', 'stream=width,height', '-of', 'json', '-i', '/proc/self/fd/%d' % infd]
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        video = json.loads(out)['streams'][0]
        return int(video['width']), int(video['height'])
    except (ValueError, KeyError, IndexError, TypeError):
        # IndexError: ffprobe answered but listed no video stream.
        # ValueError also covers json.JSONDecodeError (empty or non-JSON output).
        logger.error('Impossible to retrieve dimensions of video')
        sys.exit(-1)
def getStreams(ffprobe, inputFile): def getStreams(ffprobe, inputFile):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -637,17 +709,16 @@ def compareTimeInterval(interval1, interval2):
else: else:
return 0 return 0
def ffmpegConvert(ffmpeg, ffprobe, inputFile, inputFormat, outputFile, outputFormat, duration):
def ffmpegConvert(ffmpeg, inputFile, inputFormat, outputFile, outputFormat, duration):
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
width, height = getVideoDimensions(ffprobe, inputFile)
infd = inputFile.fileno() infd = inputFile.fileno()
outfd = outputFile.fileno() outfd = outputFile.fileno()
set_inheritable(infd, True) set_inheritable(infd, True)
set_inheritable(outfd, True) set_inheritable(outfd, True)
# TODO: canvas size to be fixed ! with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '%dx%d' % (width, height), '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '720x560', '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
'-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub', '-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',
'-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg: '-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg:
pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion') pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion')
@@ -1119,18 +1190,23 @@ def mergeMKVs(inputs, outputName):
return out return out
def findSubtitlesTracks(ffprobe, inputFile):
    """List the subtitle streams of a movie as reported by ffprobe.

    The already opened file is handed to ffprobe through /proc/self/fd, so
    the descriptor is rewound and made inheritable before the call.

    Args:
        ffprobe: Path of the ffprobe executable.
        inputFile: Open file object of the movie to probe.

    Returns:
        The value of the 'streams' entry of ffprobe's JSON output, or an
        empty list (after logging an error) when that entry is missing or
        the output is not JSON, so that callers can always iterate the result.
    """
    # ffprobe -loglevel quiet -select_streams s -show_entries stream=index:stream_tags=language -of json corgi.ts
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-i', '/proc/self/fd/%d' % infd, '-select_streams', 's', '-show_entries', 'stream=index:stream_tags=language', '-of', 'json']
    # communicate() already waits for the process; no extra wait() is needed.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        return json.loads(out)['streams']
    except (ValueError, KeyError, TypeError):
        # ValueError also covers json.JSONDecodeError (empty or non-JSON output).
        logger.error('Impossible to retrieve subtitle tracks of file')
        return []
def extractSubTitleTrack(mkvmerge, inputFileName, index, lang): def extractSubTitleTrack(mkvmerge, inputFileName, index, lang):
# mkvextract video.mkv tracks position:nom [position:nom] # mkvextract video.mkv tracks position:nom [position:nom]
@@ -1228,16 +1304,16 @@ def main():
if formatOfFile == SupportedFormat.TS: if formatOfFile == SupportedFormat.TS:
logger.info("Converting TS to MP4 (to fix timestamps).") logger.info("Converting TS to MP4 (to fix timestamps).")
try: try:
with open(mp4filename, 'w') as mp4: with open(mp4filename, 'w+') as mp4:
ffmpegConvert(paths['ffmpeg'], inputFile, 'mpegts', mp4, 'mp4', duration) ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], inputFile, 'mpegts', mp4, 'mp4', duration)
temporaries.append(mp4) temporaries.append(mp4)
logger.info("Converting MP4 to MKV.") logger.info("Converting MP4 to MKV.")
try: try:
mkv = open(mkvfilename, 'w') mkv = open(mkvfilename, 'w+')
except IOError: except IOError:
logger.error('') logger.error('')
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration) ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
if nbParts > 0: if nbParts > 0:
temporaries.append(mkv) temporaries.append(mkv)
except IOError: except IOError:
@@ -1249,7 +1325,7 @@ def main():
mkv = open(mkvfilename, 'w') mkv = open(mkvfilename, 'w')
except IOError: except IOError:
logger.error('') logger.error('')
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration) ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
if nbParts > 0: if nbParts > 0:
temporaries.append(mkv) temporaries.append(mkv)
else: else:
@@ -1368,8 +1444,18 @@ def main():
if not allOptionalTools: if not allOptionalTools:
logger.warning("Missing tools for extracting subtitles.") logger.warning("Missing tools for extracting subtitles.")
else: else:
logger.info("Find subtitles tracks and language.") try:
subtitles = findSubtitlesTracks(args.outputFile) final = open(args.outputFile, mode='r')
except IOError:
logger.error("Impossible to open %s to finalize processing." % args.outputFile)
exit(-1)
duration = getMovieDuration(paths['ffprobe'], final)
supportedLangs = getTesseractSupportedLang(paths['tesseract'])
logger.info('Supported lang: %s' % supportedLangs)
logger.info('Find subtitles tracks and language.')
subtitles = findSubtitlesTracks(paths['ffprobe'], final)
logger.info(subtitles)
sts = {} sts = {}
for subtitle in subtitles: for subtitle in subtitles:
index = subtitle['index'] index = subtitle['index']
@@ -1384,20 +1470,28 @@ def main():
logger.error("Dropping subtitle: %s because it is missing language indication") logger.error("Dropping subtitle: %s because it is missing language indication")
else: else:
logger.error("Dropping subtitle: %s because it is missing language indication") logger.error("Dropping subtitle: %s because it is missing language indication")
logger.debug(sts)
listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs)
logger.info(listOfSubtitles)
for idxName, subName, _, _ in listOfSubtitles:
try:
idx = open(idxName,'r')
except IOError:
logger.error("Impossible to open %s." % idxName)
exit(-1)
try:
sub = open(subName,'r')
except IOError:
logger.error("Impossible to open %s." % subName)
exit(-1)
temporaries.append(idx)
temporaries.append(sub)
for lang in sts: ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump)
indexes = sts[lang] logger.info(ocr)
if len(indexes) == 0:
# Nothing to do. This should not happen.
continue
if len(indexes) == 1:
index = indexes[0]
filename = 'essai-%s.srt' % lang
elif len(indexes) > 1:
nbsrt = 1
for index in indexes:
filename = 'essai-%s-%d.srt' % (lang, nbsrt)
nbsrt+=1
if not args.keep: if not args.keep:
logger.info("Cleaning temporary files") logger.info("Cleaning temporary files")