OCR handling to generate subtitle files is now working.
This commit is contained in:
208
removeads.py
208
removeads.py
@@ -18,7 +18,8 @@ from select import select
|
|||||||
from math import floor, ceil, log
|
from math import floor, ceil, log
|
||||||
from shutil import copyfile, which
|
from shutil import copyfile, which
|
||||||
import hexdump
|
import hexdump
|
||||||
|
from iso639 import Lang
|
||||||
|
from iso639.exceptions import InvalidLanguageValue
|
||||||
|
|
||||||
# Useful SPS/PPS discussion
|
# Useful SPS/PPS discussion
|
||||||
# TODO: improve situation of SPS and PPS header mismatch when merging MKV with mkvmerge to remove warnings.
|
# TODO: improve situation of SPS and PPS header mismatch when merging MKV with mkvmerge to remove warnings.
|
||||||
@@ -106,13 +107,13 @@ def getSubTitlesTracks(ffprobe, mkvPath):
|
|||||||
|
|
||||||
return tracks
|
return tracks
|
||||||
|
|
||||||
def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
|
def extractSRT(mkvextract, mkvPath, subtitles, langs):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
params = [mkvextract, mkvPath, 'tracks']
|
params = [mkvextract, mkvPath, 'tracks']
|
||||||
|
|
||||||
res = []
|
res = []
|
||||||
|
|
||||||
for lang in tracks:
|
for lang in subtitles:
|
||||||
iso = Lang(lang)
|
iso = Lang(lang)
|
||||||
|
|
||||||
if iso in langs:
|
if iso in langs:
|
||||||
@@ -121,47 +122,80 @@ def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
|
|||||||
logger.warning("Language not supported by Tesseract: %s" % iso.name)
|
logger.warning("Language not supported by Tesseract: %s" % iso.name)
|
||||||
ocrlang ='osd'
|
ocrlang ='osd'
|
||||||
|
|
||||||
if len(tracks[lang]) == 1:
|
if len(subtitles[lang]) == 1:
|
||||||
params.append('%d:%s/%s' % (tracks[lang][0], destPath ,lang))
|
params.append('%d:%s' % (subtitles[lang][0], lang))
|
||||||
res.append(('%s/%s.idx' % (destPath, lang), lang, ocrlang))
|
res.append(('%s.idx' % lang, '%s.sub' % lang, lang, ocrlang))
|
||||||
else:
|
else:
|
||||||
count = 1
|
count = 1
|
||||||
for track in tracks[lang]:
|
for track in subtitles[lang]:
|
||||||
params.append('%d:%s/%s-%d' % (track, destPath, lang, count))
|
params.append('%d:%s-%d' % (track, lang, count))
|
||||||
res.append(('%s/%s-%d.idx' % (destPath, lang,count), lang, ocrlang))
|
res.append(('%s-%d.idx' % (lang,count), '%s-%d.sub' % (lang,count), lang, ocrlang))
|
||||||
count = count+1
|
count = count+1
|
||||||
|
|
||||||
with Popen(params) as extract:
|
env = {**os.environ, 'LANG': 'C'}
|
||||||
|
with Popen(params, env=env) as extract:
|
||||||
extract.wait()
|
extract.wait()
|
||||||
|
|
||||||
if extract.returncode != 0:
|
if extract.returncode != 0:
|
||||||
print("Erreur de mkvextract: %d" % extract.returncode)
|
logger.error('Mkvextract returns an error code: %d' % extract.returncode)
|
||||||
|
return None
|
||||||
else:
|
else:
|
||||||
print("Extracted")
|
logger.info('Subtitle tracks were succesfully extracted.')
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def doOCR(vobsubocr, idxs, duration, temporaries, dumpMemFD=False):
    """Run OCR on extracted VobSub tracks and keep the SRT text in memory files.

    For every entry of ``idxs`` the ``vobsubocr`` program is started on the
    ``.idx`` file and its standard output is copied line by line into an
    anonymous memory file created with ``memfd_create``.

    Args:
        vobsubocr: path of the vobsubocr executable.
        idxs: iterable of ``(idxName, subName, lang, iso)`` tuples; ``iso`` is
            passed to the ``--lang`` option of vobsubocr.
        duration: ``timedelta`` used as the total of the progress bar, or
            ``None`` when the duration is unknown.
        temporaries: list that receives the file object of every dumped SRT.
        dumpMemFD: when true, each memory file is also copied to
            ``<idx basename>.srt``.

    Returns:
        A list of ``(srtfd, lang)`` tuples, or ``None`` when a dump file
        cannot be created (the memory files opened so far are closed).
    """
    logger = logging.getLogger(__name__)
    res = []

    # Tesseract seems to recognize the three dots ... as "su"
    ldots = re.compile(r'^su\n$')
    # Only the end time of a cue is captured: it drives the progress bar.
    timestamps = re.compile(r'^[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} \-\-> (?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}),[0-9]{3}$')

    oneSecond = timedelta(seconds=1)
    # An unknown duration gives a progress bar without a total instead of a crash.
    total = int(duration / oneSecond) if duration is not None else None

    for idxName, _, lang, iso in idxs:
        srtname = '%s.srt' % os.path.splitext(idxName)[0]
        srtfd = memfd_create(srtname, flags=0)

        with Popen([vobsubocr, '--lang', iso, idxName], stdout=PIPE) as ocr:
            pb = tqdm(TextIOWrapper(ocr.stdout, encoding="utf-8"), total=total, unit='s', desc='OCR')
            for line in pb:
                if ldots.match(line) is not None:
                    # Keep the line terminator so the cue stays separated
                    # from whatever line follows it.
                    write(srtfd, '...\n'.encode(encoding='UTF-8'))
                else:
                    write(srtfd, line.encode(encoding='UTF-8'))

                m = timestamps.match(line)
                if m is not None:
                    ts = timedelta(hours=int(m.group('hours')),
                                   minutes=int(m.group('minutes')),
                                   seconds=int(m.group('seconds')))
                    pb.n = int(ts / oneSecond)
                    pb.update()

            status = ocr.wait()

        if status != 0:
            logger.error('OCR failed with status code: %d', status)

        if dumpMemFD:
            try:
                dumpSrt = open(srtname, 'w')
            except IOError:
                logger.error('Impossible to create file: %s', srtname)
                # Nothing is handed back to the caller: release every memory file.
                os.close(srtfd)
                for fd, _ in res:
                    os.close(fd)
                return None

            lseek(srtfd, 0, SEEK_SET)
            buf = read(srtfd, fstat(srtfd).st_size)
            outfd = dumpSrt.fileno()
            pos = 0
            # Bound the loop by what was actually read, so a short read cannot spin forever.
            while pos < len(buf):
                pos += write(outfd, buf[pos:])

            temporaries.append(dumpSrt)

        res.append((srtfd, lang))

    return res
|
||||||
|
|
||||||
@@ -498,6 +532,44 @@ def getFormat(ffprobe, inputFile):
|
|||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def getMovieDuration(ffprobe, inputFile):
    """Return the duration of a movie, rounded down to whole seconds.

    The descriptor of ``inputFile`` is rewound, made inheritable and handed
    to ffprobe through ``/proc/self/fd/<fd>``; the ``format.duration`` entry
    of the JSON report is then converted.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the movie (must provide ``fileno()``).

    Returns:
        A ``timedelta``, or ``None`` (after logging an error) when ffprobe
        does not report a usable duration.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-show_format', '-of', 'json', '-i', '/proc/self/fd/%d' % infd]
    # close_fds=False so that the child can open the inherited descriptor.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        # The duration may be missing or non-numeric (e.g. "N/A"), and the
        # output may not be JSON at all: every such case is "no duration".
        duration = floor(float(json.loads(out)['format']['duration']))
    except (ValueError, KeyError, TypeError, OverflowError):
        logger.error('Impossible to retrieve duration of movie')
        return None

    return timedelta(seconds=duration)
|
||||||
|
|
||||||
|
# ffprobe -loglevel quiet -select_streams v:0 -show_entries stream=width,height -of json ./talons.ts
|
||||||
|
def getVideoDimensions(ffprobe, inputFile):
    """Return ``(width, height)`` of the first video stream of a file.

    The descriptor of ``inputFile`` is rewound, made inheritable and handed
    to ffprobe through ``/proc/self/fd/<fd>`` with ``-select_streams v:0``.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the movie (must provide ``fileno()``).

    Returns:
        A tuple of two integers ``(width, height)``.

    Raises:
        SystemExit: with code -1 (after logging an error) when the dimensions
            cannot be read from the ffprobe report.
    """
    import sys  # local import: the top-of-file import block is not visible here

    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-select_streams', 'v:0', '-show_entries', 'stream=width,height',
               '-of', 'json', '-i', '/proc/self/fd/%d' % infd]
    # close_fds=False so that the child can open the inherited descriptor.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        # An empty "streams" list (no video stream selected) raises IndexError
        # here and ends in the same error path as a missing key.
        video = json.loads(out)['streams'][0]
        return int(video['width']), int(video['height'])
    except (ValueError, KeyError, IndexError, TypeError):
        logger.error('Impossible to retrieve dimensions of video')
        sys.exit(-1)
|
||||||
|
|
||||||
|
|
||||||
def getStreams(ffprobe, inputFile):
|
def getStreams(ffprobe, inputFile):
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -637,17 +709,16 @@ def compareTimeInterval(interval1, interval2):
|
|||||||
else:
|
else:
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
def ffmpegConvert(ffmpeg, ffprobe, inputFile, inputFormat, outputFile, outputFormat, duration):
|
||||||
|
|
||||||
def ffmpegConvert(ffmpeg, inputFile, inputFormat, outputFile, outputFormat, duration):
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
width, height = getVideoDimensions(ffprobe, inputFile)
|
||||||
|
|
||||||
infd = inputFile.fileno()
|
infd = inputFile.fileno()
|
||||||
outfd = outputFile.fileno()
|
outfd = outputFile.fileno()
|
||||||
set_inheritable(infd, True)
|
set_inheritable(infd, True)
|
||||||
set_inheritable(outfd, True)
|
set_inheritable(outfd, True)
|
||||||
# TODO: canvas size to be fixed !
|
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '%dx%d' % (width, height), '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
|
||||||
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '720x560', '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
|
|
||||||
'-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',
|
'-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',
|
||||||
'-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg:
|
'-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg:
|
||||||
pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion')
|
pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion')
|
||||||
@@ -1119,18 +1190,23 @@ def mergeMKVs(inputs, outputName):
|
|||||||
|
|
||||||
return out
|
return out
|
||||||
|
|
||||||
def findSubtitlesTracks(ffprobe, inputFile):
    """List the subtitle streams of a file as reported by ffprobe.

    Equivalent command line:
    ``ffprobe -loglevel quiet -select_streams s -show_entries stream=index:stream_tags=language -of json <file>``

    The descriptor of ``inputFile`` is rewound, made inheritable and handed
    to ffprobe through ``/proc/self/fd/<fd>``.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the movie (must provide ``fileno()``).

    Returns:
        The ``streams`` list of the ffprobe JSON report. When the report
        cannot be parsed or has no ``streams`` entry, an error is logged and
        an empty list is returned so that callers can iterate the result.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    set_inheritable(infd, True)

    command = [ffprobe, '-loglevel', 'quiet', '-i', '/proc/self/fd/%d' % infd, '-select_streams', 's',
               '-show_entries', 'stream=index:stream_tags=language', '-of', 'json']
    # close_fds=False so that the child can open the inherited descriptor.
    # communicate() already waits for the process: no extra wait() is needed.
    with Popen(command, stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        return json.loads(out)['streams']
    except (ValueError, KeyError, TypeError):
        logger.error('Impossible to retrieve subtitles tracks of file')
        return []
|
||||||
|
|
||||||
|
|
||||||
def extractSubTitleTrack(mkvmerge, inputFileName, index, lang):
|
def extractSubTitleTrack(mkvmerge, inputFileName, index, lang):
|
||||||
# mkvextract video.mkv tracks position:nom [position:nom]
|
# mkvextract video.mkv tracks position:nom [position:nom]
|
||||||
@@ -1228,16 +1304,16 @@ def main():
|
|||||||
if formatOfFile == SupportedFormat.TS:
|
if formatOfFile == SupportedFormat.TS:
|
||||||
logger.info("Converting TS to MP4 (to fix timestamps).")
|
logger.info("Converting TS to MP4 (to fix timestamps).")
|
||||||
try:
|
try:
|
||||||
with open(mp4filename, 'w') as mp4:
|
with open(mp4filename, 'w+') as mp4:
|
||||||
ffmpegConvert(paths['ffmpeg'], inputFile, 'mpegts', mp4, 'mp4', duration)
|
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], inputFile, 'mpegts', mp4, 'mp4', duration)
|
||||||
temporaries.append(mp4)
|
temporaries.append(mp4)
|
||||||
logger.info("Converting MP4 to MKV.")
|
logger.info("Converting MP4 to MKV.")
|
||||||
try:
|
try:
|
||||||
mkv = open(mkvfilename, 'w')
|
mkv = open(mkvfilename, 'w+')
|
||||||
except IOError:
|
except IOError:
|
||||||
logger.error('')
|
logger.error('')
|
||||||
|
|
||||||
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration)
|
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
|
||||||
if nbParts > 0:
|
if nbParts > 0:
|
||||||
temporaries.append(mkv)
|
temporaries.append(mkv)
|
||||||
except IOError:
|
except IOError:
|
||||||
@@ -1249,7 +1325,7 @@ def main():
|
|||||||
mkv = open(mkvfilename, 'w')
|
mkv = open(mkvfilename, 'w')
|
||||||
except IOError:
|
except IOError:
|
||||||
logger.error('')
|
logger.error('')
|
||||||
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration)
|
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
|
||||||
if nbParts > 0:
|
if nbParts > 0:
|
||||||
temporaries.append(mkv)
|
temporaries.append(mkv)
|
||||||
else:
|
else:
|
||||||
@@ -1368,8 +1444,18 @@ def main():
|
|||||||
if not allOptionalTools:
|
if not allOptionalTools:
|
||||||
logger.warning("Missing tools for extracting subtitles.")
|
logger.warning("Missing tools for extracting subtitles.")
|
||||||
else:
|
else:
|
||||||
logger.info("Find subtitles tracks and language.")
|
try:
|
||||||
subtitles = findSubtitlesTracks(args.outputFile)
|
final = open(args.outputFile, mode='r')
|
||||||
|
except IOError:
|
||||||
|
logger.error("Impossible to open %s to finalize processing." % args.outputFile)
|
||||||
|
exit(-1)
|
||||||
|
|
||||||
|
duration = getMovieDuration(paths['ffprobe'], final)
|
||||||
|
supportedLangs = getTesseractSupportedLang(paths['tesseract'])
|
||||||
|
logger.info('Supported lang: %s' % supportedLangs)
|
||||||
|
logger.info('Find subtitles tracks and language.')
|
||||||
|
subtitles = findSubtitlesTracks(paths['ffprobe'], final)
|
||||||
|
logger.info(subtitles)
|
||||||
sts = {}
|
sts = {}
|
||||||
for subtitle in subtitles:
|
for subtitle in subtitles:
|
||||||
index = subtitle['index']
|
index = subtitle['index']
|
||||||
@@ -1385,19 +1471,27 @@ def main():
|
|||||||
else:
|
else:
|
||||||
logger.error("Dropping subtitle: %s because it is missing language indication")
|
logger.error("Dropping subtitle: %s because it is missing language indication")
|
||||||
|
|
||||||
for lang in sts:
|
logger.debug(sts)
|
||||||
indexes = sts[lang]
|
listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs)
|
||||||
if len(indexes) == 0:
|
logger.info(listOfSubtitles)
|
||||||
# Nothing to do. This should not happen.
|
for idxName, subName, _, _ in listOfSubtitles:
|
||||||
continue
|
try:
|
||||||
if len(indexes) == 1:
|
idx = open(idxName,'r')
|
||||||
index = indexes[0]
|
except IOError:
|
||||||
filename = 'essai-%s.srt' % lang
|
logger.error("Impossible to open %s." % idxName)
|
||||||
elif len(indexes) > 1:
|
exit(-1)
|
||||||
nbsrt = 1
|
try:
|
||||||
for index in indexes:
|
sub = open(subName,'r')
|
||||||
filename = 'essai-%s-%d.srt' % (lang, nbsrt)
|
except IOError:
|
||||||
nbsrt+=1
|
logger.error("Impossible to open %s." % subName)
|
||||||
|
exit(-1)
|
||||||
|
|
||||||
|
temporaries.append(idx)
|
||||||
|
temporaries.append(sub)
|
||||||
|
|
||||||
|
ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump)
|
||||||
|
logger.info(ocr)
|
||||||
|
|
||||||
|
|
||||||
if not args.keep:
|
if not args.keep:
|
||||||
logger.info("Cleaning temporary files")
|
logger.info("Cleaning temporary files")
|
||||||
|
|||||||
Reference in New Issue
Block a user