Compare commits
2 Commits
03922a76d2
...
ffce9aecdf
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ffce9aecdf | ||
|
|
4dbf9d9c03 |
212
removeads.py
212
removeads.py
@@ -18,7 +18,8 @@ from select import select
|
||||
from math import floor, ceil, log
|
||||
from shutil import copyfile, which
|
||||
import hexdump
|
||||
|
||||
from iso639 import Lang
|
||||
from iso639.exceptions import InvalidLanguageValue
|
||||
|
||||
# Useful SPS/PPS discussion
|
||||
# TODO: improve handling of SPS and PPS header mismatches when merging MKV files with mkvmerge, to remove warnings.
|
||||
@@ -106,13 +107,13 @@ def getSubTitlesTracks(ffprobe, mkvPath):
|
||||
|
||||
return tracks
|
||||
|
||||
def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
|
||||
def extractSRT(mkvextract, mkvPath, subtitles, langs):
|
||||
logger = logging.getLogger(__name__)
|
||||
params = [mkvextract, mkvPath, 'tracks']
|
||||
|
||||
res = []
|
||||
|
||||
for lang in tracks:
|
||||
for lang in subtitles:
|
||||
iso = Lang(lang)
|
||||
|
||||
if iso in langs:
|
||||
@@ -121,47 +122,80 @@ def extractSRT(mkvextract, mkvPath, destPath, tracks, langs):
|
||||
logger.warning("Language not supported by Tesseract: %s" % iso.name)
|
||||
ocrlang ='osd'
|
||||
|
||||
if len(tracks[lang]) == 1:
|
||||
params.append('%d:%s/%s' % (tracks[lang][0], destPath ,lang))
|
||||
res.append(('%s/%s.idx' % (destPath, lang), lang, ocrlang))
|
||||
if len(subtitles[lang]) == 1:
|
||||
params.append('%d:%s' % (subtitles[lang][0], lang))
|
||||
res.append(('%s.idx' % lang, '%s.sub' % lang, lang, ocrlang))
|
||||
else:
|
||||
count = 1
|
||||
for track in tracks[lang]:
|
||||
params.append('%d:%s/%s-%d' % (track, destPath, lang, count))
|
||||
res.append(('%s/%s-%d.idx' % (destPath, lang,count), lang, ocrlang))
|
||||
for track in subtitles[lang]:
|
||||
params.append('%d:%s-%d' % (track, lang, count))
|
||||
res.append(('%s-%d.idx' % (lang,count), '%s-%d.sub' % (lang,count), lang, ocrlang))
|
||||
count = count+1
|
||||
|
||||
with Popen(params) as extract:
|
||||
env = {**os.environ, 'LANG': 'C'}
|
||||
with Popen(params, env=env) as extract:
|
||||
extract.wait()
|
||||
|
||||
if extract.returncode != 0:
|
||||
print("Erreur de mkvextract: %d" % extract.returncode)
|
||||
logger.error('Mkvextract returns an error code: %d' % extract.returncode)
|
||||
return None
|
||||
else:
|
||||
print("Extracted")
|
||||
logger.info('Subtitle tracks were succesfully extracted.')
|
||||
|
||||
return res
|
||||
|
||||
def doOCR(vobsubocr, idxs):
|
||||
def doOCR(vobsubocr, idxs, duration, temporaries, dumpMemFD=False):
|
||||
logger = logging.getLogger(__name__)
|
||||
res = []
|
||||
|
||||
for filename, lang, iso in idxs:
|
||||
print(filename)
|
||||
srtname = '%s.srt' % os.path.splitext(filename)[0]
|
||||
print(srtname)
|
||||
# Tesseract reconnaît la chaîne de caractères ... comme le texte 'su'
|
||||
p = re.compile('^su\n$')
|
||||
for idxName, subName, lang, iso in idxs:
|
||||
srtname = '%s.srt' % os.path.splitext(idxName)[0]
|
||||
# Tesseract seems to recognize the three dots ... as "su"
|
||||
ldots = re.compile('^su\n$')
|
||||
timestamps = re.compile('^[0-9]{2}:[0-9]{2}:[0-9]{2},[0-9]{3} \-\-> (?P<hours>[0-9]{2}):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}),[0-9]{3}$')
|
||||
|
||||
if not os.path.isfile(srtname):
|
||||
with open(srtname, 'w+') as srt:
|
||||
with Popen([vobsubocr, '--lang', iso, filename], stdout=PIPE) as ocr:
|
||||
for line in ocr.stdout:
|
||||
line = line.decode('utf8')
|
||||
m = re.match(p,line)
|
||||
if m != None:
|
||||
srt.write('...')
|
||||
else:
|
||||
srt.write(line)
|
||||
res.append((srtname, lang))
|
||||
srtfd = memfd_create(srtname, flags=0)
|
||||
with Popen([vobsubocr, '--lang', iso, idxName], stdout=PIPE) as ocr:
|
||||
pb = tqdm(TextIOWrapper(ocr.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='OCR')
|
||||
for line in pb:
|
||||
m = re.match(ldots,line)
|
||||
if m != None:
|
||||
write(srtfd, '...'.encode(encoding='UTF-8'))
|
||||
else:
|
||||
write(srtfd, line.encode(encoding='UTF-8'))
|
||||
|
||||
m = re.match(timestamps, line)
|
||||
if m!=None:
|
||||
hours = int(m.group('hours'))
|
||||
minutes = int(m.group('hours'))
|
||||
seconds = int(m.group('seconds'))
|
||||
ts = timedelta(hours=hours, minutes=minutes, seconds=seconds)
|
||||
pb.n = int(ts/timedelta(seconds=1))
|
||||
pb.update()
|
||||
|
||||
status = ocr.wait()
|
||||
|
||||
if status != 0:
|
||||
logger.error('OCR failed with status code: %d' % status)
|
||||
|
||||
if dumpMemFD:
|
||||
try:
|
||||
dumpSrt = open(srtname,'w')
|
||||
except IOError:
|
||||
logger.error('Impossible to create file: %s' % srtname)
|
||||
return None
|
||||
|
||||
lseek(srtfd, 0, SEEK_SET)
|
||||
srtLength = fstat(srtfd).st_size
|
||||
buf = read(srtfd, srtLength)
|
||||
outfd = dumpSrt.fileno()
|
||||
pos = 0
|
||||
while pos < srtLength:
|
||||
pos+=write(outfd, buf[pos:])
|
||||
|
||||
temporaries.append(dumpSrt)
|
||||
|
||||
res.append((srtfd, lang))
|
||||
|
||||
return res
|
||||
|
||||
@@ -498,6 +532,44 @@ def getFormat(ffprobe, inputFile):
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def getMovieDuration(ffprobe, inputFile):
    """Return the duration of a movie, truncated to whole seconds.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the movie; its descriptor is rewound
            and handed to ffprobe.

    Returns:
        A timedelta, or None (after logging an error) when ffprobe's output
        is not valid JSON or carries no usable ``format.duration`` value.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    # The child must inherit the descriptor to reach it as /proc/self/fd/<n>.
    set_inheritable(infd, True)

    with Popen([ffprobe, '-loglevel', 'quiet', '-show_format', '-of', 'json', '-i', '/proc/self/fd/%d' % infd], stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        duration = floor(float(json.loads(out)['format']['duration']))
    except (ValueError, KeyError, TypeError, OverflowError):
        # ValueError also covers JSONDecodeError (empty or garbage output)
        # and a non-numeric duration.
        logger.error('Impossible to retrieve duration of movie')
        return None

    return timedelta(seconds=duration)
|
||||
|
||||
# ffprobe -loglevel quiet -select_streams v:0 -show_entries stream=width,height -of json ./talons.ts
|
||||
def getVideoDimensions(ffprobe, inputFile):
    """Return ``(width, height)`` of the first video stream of a file.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object of the video; its descriptor is rewound
            and handed to ffprobe.

    Returns:
        A tuple of two ints.

    Raises:
        SystemExit: with code -1, after logging an error, when the dimensions
            cannot be read (invalid output, no video stream, missing fields).
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    # The child must inherit the descriptor to reach it as /proc/self/fd/<n>.
    set_inheritable(infd, True)

    with Popen([ffprobe, '-loglevel', 'quiet', '-select_streams', 'v:0', '-show_entries', 'stream=width,height', '-of', 'json', '-i', '/proc/self/fd/%d' % infd], stdout=PIPE, close_fds=False) as probe:
        out, _ = probe.communicate()

    try:
        video = json.loads(out)['streams'][0]
        return int(video['width']), int(video['height'])
    except (ValueError, KeyError, IndexError, TypeError):
        # IndexError: ffprobe reported an empty list of streams.
        logger.error('Impossible to retrieve dimensions of video')
        raise SystemExit(-1)
|
||||
|
||||
|
||||
def getStreams(ffprobe, inputFile):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -637,17 +709,16 @@ def compareTimeInterval(interval1, interval2):
|
||||
else:
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
def ffmpegConvert(ffmpeg, inputFile, inputFormat, outputFile, outputFormat, duration):
|
||||
def ffmpegConvert(ffmpeg, ffprobe, inputFile, inputFormat, outputFile, outputFormat, duration):
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
width, height = getVideoDimensions(ffprobe, inputFile)
|
||||
|
||||
infd = inputFile.fileno()
|
||||
outfd = outputFile.fileno()
|
||||
set_inheritable(infd, True)
|
||||
set_inheritable(outfd, True)
|
||||
# TODO: canvas size to be fixed !
|
||||
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '720x560', '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
|
||||
with Popen([ffmpeg, '-y', '-loglevel', 'quiet', '-progress', '/dev/stdout', '-canvas_size', '%dx%d' % (width, height), '-f', inputFormat, '-i', '/proc/self/fd/%d' % infd,
|
||||
'-map', '0:v', '-map', '0:a', '-map', '0:s', '-bsf:v', 'h264_mp4toannexb,dump_extra=freq=keyframe', '-vcodec', 'copy', '-acodec', 'copy', '-scodec', 'dvdsub',
|
||||
'-f', outputFormat, '/proc/self/fd/%d' % outfd], stdout=PIPE, close_fds=False) as ffmpeg:
|
||||
pb = tqdm(TextIOWrapper(ffmpeg.stdout, encoding="utf-8"), total=int(duration/timedelta(seconds=1)), unit='s', desc='Conversion')
|
||||
@@ -1119,18 +1190,23 @@ def mergeMKVs(inputs, outputName):
|
||||
|
||||
return out
|
||||
|
||||
# ffprobe -loglevel quiet -select_streams s -show_entries stream=index:stream_tags=language -of json corgi.ts
def findSubtitlesTracks(ffprobe, inputFile):
    """List the subtitle streams of a file as reported by ffprobe.

    Args:
        ffprobe: path of the ffprobe executable.
        inputFile: open file object to inspect; its descriptor is rewound
            and handed to ffprobe.

    Returns:
        The ``streams`` value of ffprobe's JSON output, or None (after
        logging an error) when the output is not valid JSON or has no
        ``streams`` key.
    """
    logger = logging.getLogger(__name__)

    infd = inputFile.fileno()
    lseek(infd, 0, SEEK_SET)
    # The child must inherit the descriptor to reach it as /proc/self/fd/<n>.
    set_inheritable(infd, True)

    with Popen([ffprobe, '-loglevel','quiet', '-i', '/proc/self/fd/%d' % infd, '-select_streams', 's', '-show_entries', 'stream=index:stream_tags=language', '-of', 'json'], stdout=PIPE, close_fds=False) as probe:
        # communicate() already waits for the process to terminate.
        out, _ = probe.communicate()

    try:
        return json.loads(out)['streams']
    except (ValueError, KeyError, TypeError):
        logger.error('Impossible to retrieve subtitle tracks of file')
        return None
|
||||
|
||||
|
||||
def extractSubTitleTrack(mkvmerge, inputFileName, index, lang):
|
||||
# mkvextract video.mkv tracks position:nom [position:nom]
|
||||
@@ -1228,16 +1304,16 @@ def main():
|
||||
if formatOfFile == SupportedFormat.TS:
|
||||
logger.info("Converting TS to MP4 (to fix timestamps).")
|
||||
try:
|
||||
with open(mp4filename, 'w') as mp4:
|
||||
ffmpegConvert(paths['ffmpeg'], inputFile, 'mpegts', mp4, 'mp4', duration)
|
||||
with open(mp4filename, 'w+') as mp4:
|
||||
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], inputFile, 'mpegts', mp4, 'mp4', duration)
|
||||
temporaries.append(mp4)
|
||||
logger.info("Converting MP4 to MKV.")
|
||||
try:
|
||||
mkv = open(mkvfilename, 'w')
|
||||
mkv = open(mkvfilename, 'w+')
|
||||
except IOError:
|
||||
logger.error('')
|
||||
|
||||
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration)
|
||||
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
|
||||
if nbParts > 0:
|
||||
temporaries.append(mkv)
|
||||
except IOError:
|
||||
@@ -1249,7 +1325,7 @@ def main():
|
||||
mkv = open(mkvfilename, 'w')
|
||||
except IOError:
|
||||
logger.error('')
|
||||
ffmpegConvert(paths['ffmpeg'], mp4, 'mp4', mkv, 'matroska', duration)
|
||||
ffmpegConvert(paths['ffmpeg'], paths['ffprobe'], mp4, 'mp4', mkv, 'matroska', duration)
|
||||
if nbParts > 0:
|
||||
temporaries.append(mkv)
|
||||
else:
|
||||
@@ -1368,8 +1444,18 @@ def main():
|
||||
if not allOptionalTools:
|
||||
logger.warning("Missing tools for extracting subtitles.")
|
||||
else:
|
||||
logger.info("Find subtitles tracks and language.")
|
||||
subtitles = findSubtitlesTracks(args.outputFile)
|
||||
try:
|
||||
final = open(args.outputFile, mode='r')
|
||||
except IOError:
|
||||
logger.error("Impossible to open %s to finalize processing." % args.outputFile)
|
||||
exit(-1)
|
||||
|
||||
duration = getMovieDuration(paths['ffprobe'], final)
|
||||
supportedLangs = getTesseractSupportedLang(paths['tesseract'])
|
||||
logger.info('Supported lang: %s' % supportedLangs)
|
||||
logger.info('Find subtitles tracks and language.')
|
||||
subtitles = findSubtitlesTracks(paths['ffprobe'], final)
|
||||
logger.info(subtitles)
|
||||
sts = {}
|
||||
for subtitle in subtitles:
|
||||
index = subtitle['index']
|
||||
@@ -1384,20 +1470,28 @@ def main():
|
||||
logger.error("Dropping subtitle: %s because it is missing language indication")
|
||||
else:
|
||||
logger.error("Dropping subtitle: %s because it is missing language indication")
|
||||
|
||||
logger.debug(sts)
|
||||
listOfSubtitles = extractSRT(paths['mkvextract'], args.outputFile, sts, supportedLangs)
|
||||
logger.info(listOfSubtitles)
|
||||
for idxName, subName, _, _ in listOfSubtitles:
|
||||
try:
|
||||
idx = open(idxName,'r')
|
||||
except IOError:
|
||||
logger.error("Impossible to open %s." % idxName)
|
||||
exit(-1)
|
||||
try:
|
||||
sub = open(subName,'r')
|
||||
except IOError:
|
||||
logger.error("Impossible to open %s." % subName)
|
||||
exit(-1)
|
||||
|
||||
temporaries.append(idx)
|
||||
temporaries.append(sub)
|
||||
|
||||
for lang in sts:
|
||||
indexes = sts[lang]
|
||||
if len(indexes) == 0:
|
||||
# Nothing to do. This should not happen.
|
||||
continue
|
||||
if len(indexes) == 1:
|
||||
index = indexes[0]
|
||||
filename = 'essai-%s.srt' % lang
|
||||
elif len(indexes) > 1:
|
||||
nbsrt = 1
|
||||
for index in indexes:
|
||||
filename = 'essai-%s-%d.srt' % (lang, nbsrt)
|
||||
nbsrt+=1
|
||||
ocr = doOCR(paths['vobsubocr'], listOfSubtitles, duration, temporaries, args.dump)
|
||||
logger.info(ocr)
|
||||
|
||||
|
||||
if not args.keep:
|
||||
logger.info("Cleaning temporary files")
|
||||
|
||||
Reference in New Issue
Block a user