Source code

Revision control

Copy as Markdown

Other Tools

"""
A direct translation of the WebVTT file parsing algorithm.
"""
import re
import string
# Characters the parsers treat as whitespace when skipping or splitting.
SPACE_CHARACTERS = [' ', '\t', '\n', '\f', '\r']
# Matches a run of ONE OR MORE whitespace characters.  The quantifier must be
# "+" rather than "*": a pattern that can match the empty string makes
# re.split() (Python 3.7+) split between every single character, which would
# stop any "name:value" setting from ever being recognised.
SPACE_SPLIT_PATTERN = r"[{}]+".format(''.join(SPACE_CHARACTERS))
# The ASCII digits "0123456789".
DIGITS = string.digits
class DictInit:
    """Base class whose constructor stores each keyword argument as an attribute."""

    def __init__(self, **attributes):
        # Every keyword becomes an instance attribute of the same name.
        self.__dict__.update(attributes)

    def __repr__(self):
        # Show the class name and every attribute so printed parse results
        # are readable instead of bare object addresses.
        fields = ', '.join(
            '{}={!r}'.format(name, value)
            for name, value in self.__dict__.items()
        )
        return '{}({})'.format(type(self).__name__, fields)
class VTTCue(DictInit):
    """A cue produced by the parser; its fields are plain attributes."""
class VTTRegion(DictInit):
    """A region produced from a REGION block; its fields are plain attributes."""
class Stylesheet(DictInit):
    """A style sheet produced from a STYLE block; its fields are plain attributes."""
class W3CParser:
    """Cursor-based scanning helpers shared by the parsers in this module.

    Subclasses set ``input`` (the string being scanned) and ``position``
    (the index of the next unread character) before using the helpers.
    """

    input = None
    position = None

    def collect_characters(self, condition):
        """Consume the run of characters at the cursor that satisfy ``condition``.

        ``condition`` is called with one character at a time.  Scanning stops
        at the first character for which it returns a false value, or at the
        end of ``self.input``.  ``self.position`` is left just past the run.

        Returns the consumed characters as a string (possibly empty).
        """
        start = self.position
        end = start
        length = len(self.input)
        while end < length and condition(self.input[end]):
            end += 1
        self.position = end
        # One slice instead of appending to a string character by character.
        return self.input[start:end]

    def skip_whitespace(self):
        """Advance the cursor past any characters listed in SPACE_CHARACTERS."""
        self.collect_characters(lambda c: c in SPACE_CHARACTERS)

    def parse_percentage_string(self, input):
        """Parse a percentage string such as ``"50%"`` or ``"12.5%"``.

        Returns the value as a float in the range 0..100, or None when
        ``input`` is not a well-formed percentage or is out of range.
        """
        # 1. / 2.
        # Only ASCII digits are accepted, and \Z anchors at the true end of the
        # string.  ("\d" would also match non-ASCII digits, and "$" would also
        # match before a trailing newline, which then makes float() raise.)
        if not re.match(r'^[0-9]+(\.[0-9]+)?%\Z', input):
            return None
        # 3.
        percentage = float(input[:-1])
        # 4.
        if percentage < 0 or percentage > 100:
            return None
        # 5.
        return percentage
class VTTParser(W3CParser):
    """Parser for the text of a complete WebVTT file.

    Construct it with the file's text and call :meth:`parse`.  Results are
    collected on these attributes:

    * ``text_tracks`` -- the ``VTTCue`` objects, in the order they were parsed
    * ``stylesheets`` -- ``Stylesheet`` objects built from STYLE blocks
    * ``regions`` -- ``VTTRegion`` objects built from REGION blocks
    * ``errors`` -- message strings describing problems that were found

    The numbered comments follow the step numbers of the algorithm this code
    was translated from.
    """

    def __init__(self, input):
        self.input = input
        self.position = 0
        self.seen_cue = False
        self.text_tracks = []
        self.stylesheets = []
        self.regions = []
        self.errors = []

    def parse(self):
        """Run the WebVTT parser algorithm over ``self.input``.

        Returns None.  A missing or malformed ``WEBVTT`` signature is reported
        by appending a message to ``self.errors`` and parsing stops there.
        """
        # 1. Replace NULs and normalise every line ending to a single LF.
        self.input = (
            self.input.replace('\0', '\ufffd')
            .replace('\r\n', '\n')
            .replace('\r', '\n')
        )
        # 2.
        self.position = 0
        # 3.
        self.seen_cue = False
        # 4.
        if len(self.input) < 6:
            self.errors.append('input too small for webvtt')
            return
        # 5.
        if len(self.input) == 6 and self.input != 'WEBVTT':
            self.errors.append('invalid webvtt header')
            return
        # 6. The signature must be followed by a space, a tab or a newline.
        if len(self.input) > 6:
            signature_ok = self.input[0:6] == 'WEBVTT'
            if not (signature_ok and self.input[6] in ['\u0020', '\u0009', '\u000A']):
                self.errors.append('invalid webvtt header')
                return
        # 7. Skip the rest of the signature line.
        self.collect_characters(lambda c: c != '\n')
        # 8.
        if self.position >= len(self.input):
            return
        # 9.
        if self.input[self.position] == '\n':
            self.position += 1
        # 10.
        if self.position >= len(self.input):
            return
        # 11. Anything directly under the signature line is a header block.
        if self.input[self.position] != '\n':
            self.collect_block(in_header=True)
        else:
            self.position += 1
        # 12.
        self.collect_characters(lambda c: c == '\n')
        # 13.
        self.regions = []
        # 14.
        while self.position < len(self.input):
            # 1.
            block = self.collect_block()
            # 2.
            if isinstance(block, VTTCue):
                self.text_tracks.append(block)
            # 3.
            elif isinstance(block, Stylesheet):
                self.stylesheets.append(block)
            # 4.
            elif isinstance(block, VTTRegion):
                self.regions.append(block)
            # 5. Skip the blank lines separating blocks.
            self.collect_characters(lambda c: c == '\n')
        # 15.
        return

    @staticmethod
    def _new_cue(identifier):
        """Return a VTTCue with the given id and default settings."""
        return VTTCue(
            id=identifier,
            pause_on_exit=False,
            region=None,
            writing_direction='horizontal',
            snap_to_lines=True,
            line='auto',
            line_alignment='start alignment',
            position='auto',
            position_alignment='auto',
            cue_size=100,
            text_alignment='center',
            text='',
        )

    @staticmethod
    def _new_stylesheet():
        """Return a Stylesheet with default fields and no source yet."""
        return Stylesheet(
            location=None,
            parent=None,
            owner_node=None,
            owner_rule=None,
            media=None,
            title=None,
            alternate=False,
            origin_clean=True,
            source=None,
        )

    @staticmethod
    def _new_region():
        """Return a VTTRegion with default settings."""
        return VTTRegion(
            id='',
            width=100,
            lines=3,
            anchor_point=(0, 100),
            viewport_anchor_point=(0, 100),
            scroll_value=None,
        )

    @staticmethod
    def _is_block_keyword(buffer, keyword):
        """Return True if ``buffer`` is ``keyword`` followed only by whitespace.

        Whitespace here means the characters in SPACE_CHARACTERS.  A regex
        ``\\s`` is deliberately not used because it also matches non-ASCII
        whitespace such as U+00A0.
        """
        if not buffer.startswith(keyword):
            return False
        return all(c in SPACE_CHARACTERS for c in buffer[len(keyword):])

    def collect_block(self, in_header=False):
        """Collect one WebVTT block starting at the cursor.

        ``in_header`` is True only for the block directly below the signature
        line; in that mode no cue, style sheet or region is ever created.

        Returns a ``VTTCue``, ``Stylesheet`` or ``VTTRegion`` depending on what
        the block turned out to be, or None if it was none of those.
        """
        # 1. (done by class)
        line_count = 0  # 2.
        previous_position = self.position  # 3.
        buffer = ""  # 5.
        seen_eof = False  # 6.
        seen_arrow = False  # 7.
        cue = None  # 8.
        stylesheet = None  # 9.
        region = None  # 10.
        # 11.
        while True:
            # 1.
            line = self.collect_characters(lambda c: c != '\n')
            # 2.
            line_count += 1
            # 3. Step over the newline, or note that the input is exhausted.
            if self.position >= len(self.input):
                seen_eof = True
            else:
                self.position += 1
            # 4.
            if '-->' in line:
                # 1. A timing line is only valid as the first line, or as the
                # second line when the first one was a cue identifier.
                if not in_header and (
                    line_count == 1 or (line_count == 2 and not seen_arrow)
                ):
                    # 1.
                    seen_arrow = True
                    # 2.
                    previous_position = self.position
                    # 3. Whatever was buffered so far is the cue identifier.
                    cue = self._new_cue(buffer)
                    # 4.
                    timings = VTTCueParser(self, line, cue)
                    if not timings.collect_cue_timings_and_settings():
                        cue = None
                    else:
                        buffer = ''
                        self.seen_cue = True  # DIFFERENCE
                else:
                    # A later "-->" line starts a new block: rewind so the
                    # next call re-reads it.
                    self.errors.append('invalid webvtt cue block')
                    self.position = previous_position
                    break
            # 5. A blank line ends the block.
            elif line == '':
                break
            # 6.
            else:
                # 1. STYLE / REGION are only recognised on the first line and
                # only before any cue has been seen.
                if not in_header and line_count == 2:
                    # 1.
                    if not self.seen_cue and self._is_block_keyword(buffer, 'STYLE'):
                        stylesheet = self._new_stylesheet()
                        buffer = ''
                    # 2.
                    elif not self.seen_cue and self._is_block_keyword(buffer, 'REGION'):
                        region = self._new_region()
                        buffer = ''
                # 2.
                if buffer != '':
                    buffer += '\n'
                # 3.
                buffer += line
                # 4.
                previous_position = self.position
            # 7.
            if seen_eof:
                break
        # 12.
        if cue is not None:
            cue.text = buffer
            return cue
        # 13.
        elif stylesheet is not None:
            stylesheet.source = buffer
            return stylesheet
        # 14.
        elif region is not None:
            self.collect_region_settings(region, buffer)
            return region
        # 15.
        return None

    def _parse_anchor(self, value):
        """Parse an ``X%,Y%`` pair.

        Returns ``(x, y)`` as floats, or None when the comma is missing or
        either half is not a valid percentage string.
        """
        if ',' not in value:
            return None
        # Split on the first comma only.
        x_text, _, y_text = value.partition(',')
        x = self.parse_percentage_string(x_text)
        y = self.parse_percentage_string(y_text)
        if x is None or y is None:
            return None
        return (x, y)

    def collect_region_settings(self, region, input):
        """Apply the ``name:value`` settings found in ``input`` to ``region``.

        ``input`` is split on whitespace.  Settings that are malformed or have
        an unknown name are ignored, leaving the region's current value.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)
        # 2.
        for setting in settings:
            # 1. Skip anything without a ':' or with an empty name or value.
            name, separator, value = setting.partition(':')
            if not separator or not name or not value:
                continue
            # 4.
            if name == "id":
                region.id = value
            elif name == "width":
                percentage = self.parse_percentage_string(value)
                if percentage is not None:
                    region.width = percentage
            elif name == "lines":
                # 1. ASCII digits only ("\d" would also accept other scripts).
                if not re.match(r'^[0-9]+\Z', value):
                    continue
                # 2. / 3.
                region.lines = int(value)
            elif name == "regionanchor":
                anchor = self._parse_anchor(value)
                if anchor is not None:
                    region.anchor_point = anchor
            elif name == "viewportanchor":
                anchor = self._parse_anchor(value)
                if anchor is not None:
                    region.viewport_anchor_point = anchor
            elif name == "scroll":
                # 1.
                if value == "up":
                    region.scroll_value = "up"
class VTTCueParser(W3CParser):
    """Parser for a single cue timing line (``start --> end settings...``).

    ``parent`` is the parser that owns the cue; its ``errors`` list is shared
    so messages end up in one place, and its ``regions`` list is used to
    resolve ``region:`` settings.  ``input`` is the timing line and ``cue`` is
    the object that receives the parsed timings and settings.
    """

    def __init__(self, parent, input, cue):
        self.parent = parent
        self.errors = self.parent.errors
        self.input = input
        self.position = 0
        self.cue = cue

    def collect_cue_timings_and_settings(self):
        """Parse the timing line into ``self.cue``.

        Sets ``cue.start_time`` and ``cue.end_time`` (in seconds) and applies
        any settings that follow the end time.

        Returns True on success and False if either timestamp or the ``-->``
        separator is malformed.
        """
        # 1. (handled by class)
        # 2.
        self.position = 0
        # 3.
        self.skip_whitespace()
        # 4.
        timestamp = self.collect_timestamp()
        if timestamp is None:
            self.errors.append('invalid start time for VTTCue')
            return False
        self.cue.start_time = timestamp
        # 5.
        self.skip_whitespace()
        # 6. - 8. The separator must be exactly "-->".  startswith() also
        # copes with the line ending early instead of indexing past the end.
        if not self.input.startswith('-->', self.position):
            return False
        self.position += 3
        # 9.
        self.skip_whitespace()
        # 10.
        timestamp = self.collect_timestamp()
        if timestamp is None:
            self.errors.append('invalid end time for VTTCue')
            return False
        self.cue.end_time = timestamp
        # 11.
        remainder = self.input[self.position:]
        # 12.
        self.parse_settings(remainder)
        # Extra
        return True

    def parse_settings(self, input):
        """Apply the whitespace-separated ``name:value`` settings to the cue.

        Malformed settings and unknown names are ignored, leaving the cue's
        current value for that setting.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)
        # 2.
        for setting in settings:
            # 1. Skip anything without a ':' or with an empty name or value.
            name, separator, value = setting.partition(':')
            if not separator or not name or not value:
                continue
            # 4.
            if name == 'region':
                # The last region declared with this id wins; None if absent.
                matching = (
                    region
                    for region in reversed(self.parent.regions)
                    if region.id == value
                )
                self.cue.region = next(matching, None)
            elif name == 'vertical':
                if value in ['rl', 'lr']:
                    self.cue.writing_direction = value
            elif name == 'line':
                self._apply_line_setting(value)
            elif name == 'position':
                self._apply_position_setting(value)
            elif name == 'size':
                number = self.parse_percentage_string(value)
                if number is not None:
                    self.cue.cue_size = number
            elif name == 'align':
                if value in ('start', 'center', 'end', 'left', 'right'):
                    self.cue.text_alignment = value

    def _apply_line_setting(self, value):
        """Apply a ``line:`` setting value (``pos`` or ``pos,align``) to the cue."""
        # 1. / 2. Split on the first comma; no comma means no alignment given.
        linepos, comma, linealign = value.partition(',')
        if not comma:
            linealign = None
        # 3. At least one ASCII digit is required.
        if not re.search(r'[0-9]', linepos):
            return
        # 4.
        is_percentage = linepos[-1] == '%'
        if is_percentage:
            number = self.parse_percentage_string(linepos)
            if number is None:
                return
        else:
            # 1. Only '-', '.' and ASCII digits are allowed.
            if not re.match(r'^[-.0-9]*\Z', linepos):
                return
            # 2. A minus sign is only allowed as the first character.
            if '-' in linepos[1:]:
                return
            # 3.
            if linepos.count('.') > 1:
                return
            # 4. A decimal point needs a digit on both sides.
            if '.' in linepos and not re.search(r'[0-9]\.[0-9]', linepos):
                return
            # 5.
            number = float(linepos)
        # 5. - 8. An alignment that is present but not recognised invalidates
        # the whole setting.
        if linealign in ('start', 'center', 'end'):
            self.cue.line_alignment = linealign
        elif linealign is not None:
            return
        # 9.
        self.cue.line = number
        # 10. Percentages are not snapped to lines; plain numbers are.
        self.cue.snap_to_lines = not is_percentage

    def _apply_position_setting(self, value):
        """Apply a ``position:`` setting value (``pos%`` or ``pos%,align``)."""
        # 1. / 2. Split on the first comma; no comma means no alignment given.
        colpos, comma, colalign = value.partition(',')
        if not comma:
            colalign = None
        # 3.
        number = self.parse_percentage_string(colpos)
        if number is None:
            return
        # 4. - 7. This is the cue's *position* alignment, which is a separate
        # field from the line alignment set by the "line" setting.
        if colalign in ('line-left', 'center', 'line-right'):
            self.cue.position_alignment = colalign
        elif colalign is not None:
            return
        # 8.
        self.cue.position = number

    def collect_timestamp(self):
        """Collect a timestamp (``[hh:]mm:ss.ttt``) at the cursor.

        Returns the time in seconds as a number, or None if the text at the
        cursor is not a valid timestamp.  The cursor is left just past the
        characters that were consumed.
        """
        # 1. (handled by class)
        # 2.
        most_significant_units = 'minutes'
        # 3.
        if self.position >= len(self.input):
            return None
        # 4.
        if self.input[self.position] not in DIGITS:
            return None
        # 5.
        digits = self.collect_characters(lambda c: c in DIGITS)
        # 6.
        value_1 = int(digits)
        # 7. A first field that is not two digits, or is above 59, can only
        # be an hours field.
        if len(digits) != 2 or value_1 > 59:
            most_significant_units = 'hours'
        # 8.
        if self.position >= len(self.input) or self.input[self.position] != ':':
            return None
        self.position += 1
        # 9.
        digits = self.collect_characters(lambda c: c in DIGITS)
        # 10.
        if len(digits) != 2:
            return None
        # 11.
        value_2 = int(digits)
        # 12. A third field is present when hours were detected or another
        # ':' follows.
        another_colon = (
            self.position < len(self.input) and self.input[self.position] == ':'
        )
        if most_significant_units == 'hours' or another_colon:
            # 1.
            if not another_colon:
                return None
            self.position += 1
            # 2.
            digits = self.collect_characters(lambda c: c in DIGITS)
            # 3.
            if len(digits) != 2:
                return None
            # 4.
            value_3 = int(digits)
        else:
            # Only mm:ss was given: shift the fields down and use 0 hours.
            value_3 = value_2
            value_2 = value_1
            value_1 = 0
        # 13.
        if self.position >= len(self.input) or self.input[self.position] != '.':
            return None
        self.position += 1
        # 14.
        digits = self.collect_characters(lambda c: c in DIGITS)
        # 15.
        if len(digits) != 3:
            return None
        # 16.
        value_4 = int(digits)
        # 17. Minutes and seconds may be 0..59 inclusive; only values above
        # 59 are invalid.
        if value_2 > 59 or value_3 > 59:
            return None
        # 18.
        result = value_1 * 60 * 60 + value_2 * 60 + value_3 + value_4 / 1000
        # 19.
        return result
def main(argv):
    """Parse each WebVTT file named in ``argv[1:]`` and print what was found.

    ``argv[0]`` is ignored (it is the program name when called with
    ``sys.argv``).  For every path the cues, style sheets, regions and errors
    collected by ``VTTParser`` are printed.
    """
    for path in argv[1:]:
        # Open one file at a time inside ``with`` so a failure on a later path
        # cannot leak handles opened earlier.  WebVTT files are UTF-8;
        # "utf-8-sig" additionally drops a leading byte order mark so the
        # "WEBVTT" signature is the first thing the parser sees.
        with open(path, 'r', encoding='utf-8-sig') as file:
            text = file.read()
        parser = VTTParser(text)
        parser.parse()
        print("Results: {}".format(file))
        print(" Cues: {}".format(parser.text_tracks))
        print(" StyleSheets: {}".format(parser.stylesheets))
        print(" Regions: {}".format(parser.regions))
        print(" Errors: {}".format(parser.errors))
# Script entry point: hand the full command line to main().
if __name__ == '__main__':
    import sys

    main(sys.argv)