Compare commits

...

2 Commits

Author SHA1 Message Date
Frédéric Tronel
ffce9aecdf Handling of OCR to generate subtitles files is working. 2023-12-22 14:57:25 +01:00
Frédéric Tronel
4dbf9d9c03 Suppress SRT files for cleaning. 2023-12-22 14:56:05 +01:00
2 changed files with 154 additions and 60 deletions

View File

@@ -1,2 +1,2 @@
# clean: force-delete every file in the current directory that matches the
# glob patterns below. The second pattern list is a superset of the first:
# it additionally removes *.srt files.
clean:
rm -f *.ppm *.pcm part*
rm -f *.ppm *.pcm part* *.srt

View File

@@ -18,7 +18,8 @@ from select import select
from math import floor, ceil, log
from shutil import copyfile, which
import hexdump
from iso639 import Lang
from iso639.exceptions import InvalidLanguageValue
# Useful SPS/PPS discussion
# TODO: improve situation of SPS and PPS header mismatch when merging MVK with mkvmerge to remove warnings.
@@ -106,13 +107,13 @@ def getSubTitlesTracks(ffprobe, mkvPath):
return tracks
def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
def extractSRT(mkvextract, mkvPath, subtitles, langs):
logger = logging.getLogger(__name__)
params = [mkvextract, mkvPath, 'tracks']
res = []
for lang in tracks:
for lang in subtitles:
iso = Lang(lang)
if iso in langs:
@@ -121,47 +122,80 @@ def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
logger.warning("Language not supported by Tesseract: %s" % iso.name)
ocrlang ='osd'
if len(tracks[lang]) == 1:
params.append('%d:%s/%s' % (tracks[lang][0], destPath ,lang))
res.append(('%s/%s.idx' % (destPath, lang), lang, ocrlang))
if len(subtitles[lang]) == 1:
params.append('%d:%s' % (subtitles[lang][0], lang))
res.append(('%s.idx' % lang, '%s.sub' % lang, lang, ocrlang))
else:
count = 1
for track in tracks[lang]:
params.append('%d:%s/%s-%d' % (track, destPath, lang, count))
res.append(('%s/%s-%d.idx' % (destPath, lang,count), lang, ocrlang))
for track in subtitles[lang]:
params.append('%d:%s-%d' % (track, lang, count))
res.append(('%s-%d.idx' % (lang,count), '%s-%d.sub' % (lang,count), lang, ocrlang))
count = count+1
with Popen(params) as extract:
env = {**os.environ, 'LANG': 'C'}
with Popen(params, env=env) as extract:
extract.wait()
if extract.returncode != 0:
print("Erreur de mkvextract: %d" % extract.returncode)
logger.error('Mkvextract returns an error code: %d' % extract.returncode)
return None
else:
print("Extracted")
logger.info('Subtitle tracks were succesfully extracted.')
return res
def doOCR(vobsubocr, idxs):
def doOCR(vobsubocr, idxs, duration, temporaries, dumpMemFD=False):
logger = logging.getLogger(__name__)
res = []
for filename, lang, iso in idxs:
print(filename)
srtname = '%s.srt' % os.path.splitext(filename)[0]
print(srtname)
# Tesseract reconnaît la chaîne de caractères ... comme le texte 'su'
p = re.compile('^su\n$')
for idxName, subName, lang, iso in idxs:
srtname = '%s.srt' % os.path.splitext(idxName)[0]
# Tesseract seems to recognize the three dots ... as "su"
ldots = re.compile('^su\n$')
timestamps = re.compile('^[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} \-\-> (?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}),[0-9]{3}$')
if not os.path.isfile(srtname):
with open(srtname, 'w+') as srt:
with Popen([vobsubocr, '--lang', iso, filename], stdout=PIPE) as ocr:
for line in ocr.stdout:
line = line.decode('utf8')
m = re.match(p,line)
if m != None:
srt.write('...')
else:
srt.write(line)
res.append((srtname, lang))
srtfd = memfd_create(srtname, flags=0)
with Popen([vobsubocr, '--lang', iso, idxName], stdout=PIPE) as ocr:
pb = tqdm(TextIOWrapper(ocr.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='OCR')
for line in pb:
m = re.match(ldots,line)
if m != None:
write(srtfd, '...'.encode(encoding='UTF-8'))
else:
write(srtfd, line.encode(encoding='UTF-8'))
m = re.match(timestamps, line)
if m!=None:
hours = int(m.group('hours'))
minutes = int(m.group('hours'))
seconds = int(m.group('seconds'))
ts = timedelta(hours=hours, minutes=minutes, seconds=seconds)
pb.n = int(ts/timedelta(seconds=1))
pb.update()
status = ocr.wait()
if status != 0:
logger.error('OCR failed with status code: %d' % status)
if dumpMemFD:
try:
dumpSrt = open(srtname,'w')
except IOError:
logger.error('Impossible to create file: %s' % srtname)
return None
lseek(srtfd, 0, SEEK_SET)
srtLength = fstat(srtfd).st_size
buf = read(srtfd, srtLength)
outfd = dumpSrt.fileno()
pos = 0
while pos < srtLength:
pos+=write(outfd, buf[pos:])
temporaries.append(dumpSrt)
res.append((srtfd, lang))
return res
@@ -498,6 +532,44 @@ def getFormat(ffprobe, inputFile):
return None
def getMovieDuration(ffprobe, inputFile):
    """Return the duration of a movie as reported by ffprobe.

    The file descriptor of ``inputFile`` is rewound and made inheritable so
    that the ffprobe child process can read it through ``/proc/self/fd/N``.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the movie to inspect.

    Returns:
        A ``timedelta`` holding the duration truncated to whole seconds, or
        ``None`` (after logging an error) when ffprobe's output carries no
        usable ``format.duration`` value.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-show_format', '-of', 'json',
               '-i', '/proc/self/fd/%d' % infd]
    # close_fds=False keeps infd open in the child.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        raw, _ = probe.communicate()

    try:
        duration = floor(float(json.loads(raw)['format']['duration']))
    except (ValueError, KeyError, TypeError, OverflowError):
        # Empty or non-JSON output, a missing key, or a non-numeric duration:
        # report them all through the documented failure path.
        logger.error('Impossible to retrieve duration of movie')
        return None

    return timedelta(seconds=duration)
# ffprobe -loglevel quiet -select_streams v:0 -show_entries stream=width,height -of json ./talons.ts
def getVideoDimensions(ffprobe, inputFile):
    """Return ``(width, height)`` of the first video stream found by ffprobe.

    The file descriptor of ``inputFile`` is rewound and made inheritable so
    that the ffprobe child process can read it through ``/proc/self/fd/N``.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the video to inspect.

    Returns:
        A tuple of two ints ``(width, height)``.

    Raises:
        SystemExit: with code -1, after logging an error, when the dimensions
            cannot be read from ffprobe's output.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-select_streams', 'v:0',
               '-show_entries', 'stream=width,height', '-of', 'json',
               '-i', '/proc/self/fd/%d' % infd]
    # close_fds=False keeps infd open in the child.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        raw, _ = probe.communicate()

    try:
        video = json.loads(raw)['streams'][0]
        return int(video['width']), int(video['height'])
    except (ValueError, KeyError, IndexError, TypeError):
        # Covers unparseable output, an empty stream list, and missing or
        # non-integer width/height.
        logger.error('Impossible to retrieve dimensions of video')
        exit(-1)
def getStreams(ffprobe, inputFile):
logger = logging.getLogger(__name__)
@@ -637,17 +709,16 @@ def compareTimeInterval(interval1, interval2):
else:
return 0
def ffmpegConvert(ffmpeg, inputFile, inputFormat, outputFile, outputFormat, duration):
def ffmpegConvert(ffmpeg, ffprobe, inputFile, inputFormat, outputFile, outputFormat, duration):
logger = logging.getLogger(__name__)
width, height = getVideoDimensions(ffprobe, inputFile)
infd = inputFile.fileno()
outfd = outputFile.fileno()
set_inheritable(infd, True)
set_inheritable(outfd, True)
# TODO: canvas size to be fixed !
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '720x560', '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '%dx%d' % (width, height), '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
'-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',
'-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg:
pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion')
@@ -1119,18 +1190,23 @@ def mergeMKVs(inputs, outputName):
return out
# ffprobe -loglevel quiet -select_streams s -show_entries stream=index:stream_tags=language -of json corgi.ts
def findSubtitlesTracks(ffprobe, inputFile):
    """List the subtitle streams of a file as reported by ffprobe.

    The file descriptor of ``inputFile`` is rewound and made inheritable so
    that the ffprobe child process can read it through ``/proc/self/fd/N``.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the container to inspect.

    Returns:
        The value of the ``streams`` entry of ffprobe's JSON output, or
        ``None`` (after logging an error) when that entry is unavailable.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-i', '/proc/self/fd/%d' % infd,
               '-select_streams', 's',
               '-show_entries', 'stream=index:stream_tags=language',
               '-of', 'json']
    # communicate() already waits for the child, so no extra wait() is needed.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        raw, _ = probe.communicate()

    try:
        return json.loads(raw)['streams']
    except (ValueError, KeyError, TypeError):
        # Empty or non-JSON output, or JSON without a 'streams' entry.
        logger.error('Impossible to retrieve format of file')
        return None
def extractSubTitleTrack(mkvmerge, inputFileName, index, lang):
# mkvextract video.mkv tracks position:nom [position:nom]
@@ -1228,16 +1304,16 @@ def main():
if formatOfFile == SupportedFormat.TS:
logger.info("Converting TS to MP4 (to fix timestamps).")
try:
with open(mp4filename, 'w') as mp4:
ffmpegConvert(paths['ffmpeg'], inputFile, 'mpegts', mp4, 'mp4', duration)
with open(mp4filename, 'w+') as mp4:
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], inputFile, 'mpegts', mp4, 'mp4', duration)
temporaries.append(mp4)
logger.info("Converting MP4 to MKV.")
try:
mkv = open(mkvfilename, 'w')
mkv = open(mkvfilename, 'w+')
except IOError:
logger.error('')
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration)
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
if nbParts > 0:
temporaries.append(mkv)
except IOError:
@@ -1249,7 +1325,7 @@ def main():
mkv = open(mkvfilename, 'w')
except IOError:
logger.error('')
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration)
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
if nbParts > 0:
temporaries.append(mkv)
else:
@@ -1368,8 +1444,18 @@ def main():
if not allOptionalTools:
logger.warning("Missing tools for extracting subtitles.")
else:
logger.info("Find subtitles tracks and language.")
subtitles = findSubtitlesTracks(args.outputFile)
try:
final = open(args.outputFile, mode='r')
except IOError:
logger.error("Impossible to open %s to finalize processing." % args.outputFile)
exit(-1)
duration = getMovieDuration(paths['ffprobe'], final)
supportedLangs = getTesseractSupportedLang(paths['tesseract'])
logger.info('Supported lang: %s' % supportedLangs)
logger.info('Find subtitles tracks and language.')
subtitles = findSubtitlesTracks(paths['ffprobe'], final)
logger.info(subtitles)
sts = {}
for subtitle in subtitles:
index = subtitle['index']
@@ -1384,20 +1470,28 @@ def main():
logger.error("Dropping subtitle: %s because it is missing language indication")
else:
logger.error("Dropping subtitle: %s because it is missing language indication")
logger.debug(sts)
listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs)
logger.info(listOfSubtitles)
for idxName, subName, _, _ in listOfSubtitles:
try:
idx = open(idxName,'r')
except IOError:
logger.error("Impossible to open %s." % idxName)
exit(-1)
try:
sub = open(subName,'r')
except IOError:
logger.error("Impossible to open %s." % subName)
exit(-1)
temporaries.append(idx)
temporaries.append(sub)
for lang in sts:
indexes = sts[lang]
if len(indexes) == 0:
# Nothing to do. This should not happen.
continue
if len(indexes) == 1:
index = indexes[0]
filename = 'essai-%s.srt' % lang
elif len(indexes) > 1:
nbsrt = 1
for index in indexes:
filename = 'essai-%s-%d.srt' % (lang, nbsrt)
nbsrt+=1
ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump)
logger.info(ocr)
if not args.keep:
logger.info("Cleaning temporary files")