parser.py - mozsearch

mozilla-central/testing/web-platform/tests/webvtt/parsing/file-parsing/tools/parser.py

Enable keyboard shortcuts

Source code

File a bug in Core :: Audio/Video: Playback

Revision control

Copy as Markdown

Other Tools

"""

A direct translation of the WebVTT file parsing algorithm.

See https://w3c.github.io/webvtt/#file-parsing for documentation.

"""

import re

import string

# The characters treated as whitespace when skipping and splitting.
SPACE_CHARACTERS = [' ', '\t', '\n', '\f', '\r']

# Separator used to split a settings string into tokens: one OR MORE
# whitespace characters.  The quantifier must be `+`: a `*` also matches the
# empty string, and since Python 3.7 re.split() splits on empty matches, which
# would chop every setting into single characters.
SPACE_SPLIT_PATTERN = r"[{}]+".format(''.join(SPACE_CHARACTERS))

# ASCII digits "0".."9", used when collecting timestamp components.
DIGITS = string.digits

class DictInit:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **attributes):
        # The parameter is deliberately not called `dict`, so the builtin of
        # that name is not shadowed inside the constructor.
        self.__dict__.update(attributes)

    def __repr__(self):
        # Show the class name and every attribute (in assignment order) so
        # printed parse results are readable instead of bare object addresses.
        fields = ', '.join(
            '{}={!r}'.format(name, value)
            for name, value in self.__dict__.items()
        )
        return '{}({})'.format(type(self).__name__, fields)

class VTTCue(DictInit):
    """Attribute bag holding the fields of one parsed WebVTT cue."""

class VTTRegion(DictInit):
    """Attribute bag holding the fields of one parsed WebVTT region."""

class Stylesheet(DictInit):
    """Attribute bag holding the fields of one parsed STYLE block."""

class W3CParser:
    """Cursor-based helpers shared by the parsers in this file.

    Subclasses assign ``input`` (the string being parsed) and ``position``
    (index of the next unread character) before using the helpers.
    """

    input = None
    position = None

    def collect_characters(self, condition):
        """Consume the run of characters for which ``condition`` is true.

        Starting at ``self.position``, the cursor is advanced past every
        consecutive character ``c`` with ``condition(c)`` true.  The consumed
        substring (possibly empty) is returned.
        """
        start = self.position
        while self.position < len(self.input) and condition(self.input[self.position]):
            self.position += 1
        # One slice instead of per-character string concatenation.
        return self.input[start:self.position]

    def skip_whitespace(self):
        """Advance the cursor past any characters listed in SPACE_CHARACTERS."""
        self.collect_characters(lambda c: c in SPACE_CHARACTERS)

    def parse_percentage_string(self, input):
        """Parse a percentage string such as ``"42.5%"``.

        Returns the number as a float when ``input`` consists solely of ASCII
        digits, an optional ``.`` followed by more ASCII digits, and a final
        ``%``, and the value lies in the range 0..100.  Returns None otherwise.
        """
        # 1. / 2.
        # fullmatch() (rather than match() with `$`) so that a trailing
        # newline is rejected instead of slipping through and breaking the
        # float() conversion below; [0-9] (rather than \d) so that only ASCII
        # digits are accepted.
        if not re.fullmatch(r'[0-9]+(\.[0-9]+)?%', input):
            return None

        # 3.
        percentage = float(input[:-1])

        # 4.
        if percentage < 0 or percentage > 100:
            return None

        # 5.
        return percentage

class VTTParser(W3CParser):

    def __init__(self, input):
        """Set up a parser over the WebVTT source text ``input``."""
        # Cursor over the text being parsed.
        self.input, self.position = input, 0
        # Becomes True once a cue has been collected successfully.
        self.seen_cue = False
        # Parse results, one list per kind of block.
        self.text_tracks, self.stylesheets, self.regions = [], [], []
        # Human-readable descriptions of the problems met while parsing.
        self.errors = []

    def parse(self):
        """Run the WebVTT parser algorithm over ``self.input``.

        Collected cues, style sheets and regions are appended to
        ``self.text_tracks``, ``self.stylesheets`` and ``self.regions``;
        header problems are reported through ``self.errors``.  Always returns
        None.  The numbered comments follow the steps of the algorithm.
        """
        def is_line_feed(char):
            return char == '\n'

        # 1. Replace NULs and normalise every line ending to a single LF.
        without_nul = self.input.replace('\0', '\ufffd')
        self.input = without_nul.replace('\r\n', '\n').replace('\r', '\n')

        # 2. / 3.
        self.position = 0
        self.seen_cue = False

        # The text is not modified past this point, so its length is fixed.
        total = len(self.input)

        # 4.
        if total < 6:
            self.errors.append('input too small for webvtt')
            return

        # 5. / 6. The signature must be the whole input, or be followed by a
        # space, a tab or a line feed.
        if total == 6:
            signature_ok = self.input == 'WEBVTT'
        else:
            signature_ok = (
                self.input[0:6] == 'WEBVTT'
                and self.input[6] in ['\u0020', '\u0009', '\u000A']
            )
        if not signature_ok:
            self.errors.append('invalid webvtt header')
            return

        # 7. Skip the remainder of the signature line.
        self.collect_characters(lambda c: c != '\n')

        # 8.
        if self.position >= total:
            return

        # 9.
        if is_line_feed(self.input[self.position]):
            self.position += 1

        # 10.
        if self.position >= total:
            return

        # 11. Either step over the blank line that ends the header, or
        # collect (and discard) the header block.
        if is_line_feed(self.input[self.position]):
            self.position += 1
        else:
            self.collect_block(in_header=True)

        # 12.
        self.collect_characters(is_line_feed)

        # 13.
        self.regions = []

        # 14. Built after step 13 so that it refers to the fresh regions list.
        destinations = (
            (VTTCue, self.text_tracks),      # 14.2
            (Stylesheet, self.stylesheets),  # 14.3
            (VTTRegion, self.regions),       # 14.4
        )
        while self.position < total:
            # 14.1
            block = self.collect_block()

            for block_type, bucket in destinations:
                if isinstance(block, block_type):
                    bucket.append(block)
                    break

            # 14.5
            self.collect_characters(is_line_feed)

        # 15. Nothing to return.

    def collect_block(self, in_header=False):
        """Collect one WebVTT block starting at ``self.position``.

        Lines are read until a blank line, the end of the input, or a line
        containing ``-->`` in a place where it cannot start a cue.

        ``in_header`` is set while reading the header block; no cue, style
        sheet or region is created in that case.

        Returns the collected VTTCue, Stylesheet or VTTRegion, or None when
        the block is none of these.  The numbered comments follow the steps
        of the algorithm.
        """
        # 1. (done by class)
        line_count = 0                     # 2.
        previous_position = self.position  # 3.
        line = ""                          # 4.
        buffer = ""                        # 5.
        seen_eof = False                   # 6.
        seen_arrow = False                 # 7.
        cue = None                         # 8.
        stylesheet = None                  # 9.
        region = None                      # 10.

        # 11.
        while True:
            # 1.
            line = self.collect_characters(lambda c: c != '\n')

            # 2.
            line_count += 1

            # 3. Either the input is exhausted or we step over the line feed.
            if self.position >= len(self.input):
                seen_eof = True
            else:
                self.position += 1

            # 4.
            if '-->' in line:
                # 1. A timing line is only accepted as the first line of the
                # block, or as the second line when no arrow was seen yet.
                if not in_header and (line_count == 1 or line_count == 2 and not seen_arrow):
                    # 1.
                    seen_arrow = True

                    # 2.
                    previous_position = self.position

                    # 3. A cue with default settings; whatever was buffered
                    # before the timing line becomes its identifier.
                    cue = VTTCue(
                        id = buffer,
                        pause_on_exit = False,
                        region = None,
                        writing_direction = 'horizontal',
                        snap_to_lines = True,
                        line = 'auto',
                        line_alignment = 'start alignment',
                        position = 'auto',
                        position_alignment = 'auto',
                        cue_size = 100,
                        text_alignment = 'center',
                        text = '',
                    )

                    # 4.
                    if not VTTCueParser(self, line, cue).collect_cue_timings_and_settings():
                        cue = None
                    else:
                        buffer = ''
                        self.seen_cue = True # DIFFERENCE
                else:
                    # A misplaced arrow ends this block; rewind so that the
                    # line is read again as the start of the next block.
                    self.errors.append('invalid webvtt cue block')
                    self.position = previous_position
                    break

            # 5.
            elif line == '':
                break

            # 6.
            else:
                # 1. On the second line, the first line (held in buffer)
                # decides whether this is a STYLE or a REGION block; both are
                # only recognised before the first cue.
                if not in_header and line_count == 2:
                    # 1.
                    if not self.seen_cue and re.match(r'^STYLE\s*$', buffer):
                        stylesheet = Stylesheet(
                            location = None,
                            parent = None,
                            owner_node = None,
                            owner_rule = None,
                            media = None,
                            title = None,
                            alternate = False,
                            origin_clean = True,
                            source = None,
                        )
                        buffer = ''

                    # 2.
                    elif not self.seen_cue and re.match(r'^REGION\s*$', buffer):
                        region = VTTRegion(
                            id = '',
                            width = 100,
                            lines = 3,
                            anchor_point = (0, 100),
                            viewport_anchor_point = (0, 100),
                            scroll_value = None,
                        )
                        buffer = ''

                # 2.
                if buffer != '':
                    buffer += '\n'

                # 3.
                buffer += line

                # 4.
                previous_position = self.position

            # 7.
            if seen_eof:
                break

        # 12.
        if cue is not None:
            cue.text = buffer
            return cue

        # 13.
        elif stylesheet is not None:
            stylesheet.source = buffer
            return stylesheet

        # 14.
        elif region is not None:
            self.collect_region_settings(region, buffer)
            return region

        # 15.
        return None

    def collect_region_settings(self, region, input):
        """Apply the region settings found in ``input`` to ``region``.

        ``input`` is split with SPACE_SPLIT_PATTERN into ``name:value``
        tokens.  Recognised names are ``id``, ``width``, ``lines``,
        ``regionanchor``, ``viewportanchor`` and ``scroll``; malformed tokens
        and invalid values are skipped and leave ``region`` untouched.
        """
        # 1. / 2.
        for token in re.split(SPACE_SPLIT_PATTERN, input):
            # 2.1 - 2.3. A usable token has a non-empty name before its first
            # colon and a non-empty value after it.
            name, colon, value = token.partition(':')
            if not (colon and name and value):
                continue

            # 2.4
            if name == "id":
                region.id = value

            elif name == "width":
                width = self.parse_percentage_string(value)
                if width is not None:
                    region.width = width

            elif name == "lines":
                # Digits only.
                if re.match(r'^\d+$', value):
                    region.lines = int(value)

            elif name in ("regionanchor", "viewportanchor"):
                # Both anchors share the "X%,Y%" syntax.
                x_text, comma, y_text = value.partition(',')
                if not comma:
                    continue

                x_percent = self.parse_percentage_string(x_text)
                y_percent = self.parse_percentage_string(y_text)
                if x_percent is None or y_percent is None:
                    continue

                if name == "regionanchor":
                    region.anchor_point = (x_percent, y_percent)
                else:
                    region.viewport_anchor_point = (x_percent, y_percent)

            elif name == "scroll":
                if value == "up":
                    region.scroll_value = "up"

class VTTCueParser(W3CParser):

    def __init__(self, parent, input, cue):
        """Set up a parser for one cue timing line.

        ``parent`` is the parser that owns the region list and the error
        list, ``input`` is the line to parse and ``cue`` is the cue object
        that receives the parsed values.
        """
        self.parent = parent
        # Same list object as the parent's, so errors show up there too.
        self.errors = parent.errors
        self.input, self.position = input, 0
        self.cue = cue

    def collect_cue_timings_and_settings(self):
        """Parse ``self.input`` as a cue timing line and fill in ``self.cue``.

        The line has the form ``<start> --> <end> [settings]``.  On success
        the cue's ``start_time`` and ``end_time`` are set, any settings are
        applied through ``parse_settings`` and True is returned.  On failure
        False is returned; an invalid timestamp additionally appends a
        message to ``self.errors``.
        """
        # 1. (handled by class)

        # 2.
        self.position = 0

        # 3.
        self.skip_whitespace()

        # 4.
        start_time = self.collect_timestamp()
        if start_time is None:
            self.errors.append('invalid start time for VTTCue')
            return False
        self.cue.start_time = start_time

        # 5.
        self.skip_whitespace()

        # 6. - 8. The arrow, one character at a time.  Running off the end of
        # the line is a failure, not an IndexError.
        for expected in '-->':
            if self.position >= len(self.input) or self.input[self.position] != expected:
                return False
            self.position += 1

        # 9.
        self.skip_whitespace()

        # 10.
        end_time = self.collect_timestamp()
        if end_time is None:
            self.errors.append('invalid end time for VTTCue')
            return False
        self.cue.end_time = end_time

        # 11.
        remainder = self.input[self.position:]

        # 12.
        self.parse_settings(remainder)

        # Extra
        return True

    def parse_settings(self, input):
        """Apply the cue settings found in ``input`` to ``self.cue``.

        ``input`` is split with SPACE_SPLIT_PATTERN into ``name:value``
        tokens.  Recognised names are ``region``, ``vertical``, ``line``,
        ``position``, ``size`` and ``align``; malformed tokens and invalid
        values are skipped without touching the cue.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)

        # 2.
        for setting in settings:
            # 1. Skip tokens without a colon, or whose first colon is the
            # first or the last character.
            if ':' not in setting:
                continue
            index = setting.index(':')
            if index in [0, len(setting) - 1]:
                continue

            # 2.
            name = setting[:index]

            # 3.
            value = setting[index + 1:]

            # 4.
            if name == 'region':
                # 1. The last region with a matching identifier, or None.
                matching = (region for region in reversed(self.parent.regions) if region.id == value)
                self.cue.region = next(matching, None)

            elif name == 'vertical':
                # 1. and 2.
                if value in ['rl', 'lr']:
                    self.cue.writing_direction = value

            elif name == 'line':
                # 1. / 2. Optional ",alignment" suffix.
                linepos, comma, linealign = value.partition(',')
                if not comma:
                    linealign = None

                # 3. Also guarantees linepos is non-empty for the checks below.
                if not re.search(r'\d', linepos):
                    continue

                # 4.
                is_percentage = linepos[-1] == '%'
                if is_percentage:
                    number = self.parse_percentage_string(linepos)
                    if number is None:
                        continue
                else:
                    # 1. Only '-', '.' and digits are allowed.
                    if not re.match(r'^[-\.\d]*$', linepos):
                        continue
                    # 2. A minus sign may only come first.
                    if '-' in linepos[1:]:
                        continue
                    # 3.
                    if linepos.count('.') > 1:
                        continue
                    # 4. A dot needs a digit on both sides.
                    if '.' in linepos:
                        if not re.search(r'\d\.\d', linepos):
                            continue
                    # 5.
                    number = float(linepos)

                # 5. - 7.
                if linealign in ('start', 'center', 'end'):
                    self.cue.line_alignment = linealign
                # 8. Any other alignment keyword invalidates the setting.
                elif linealign is not None:
                    continue

                # 9.
                self.cue.line = number

                # 10. A percentage is a position, not a line number.
                self.cue.snap_to_lines = not is_percentage

            elif name == 'position':
                # 1. / 2. Optional ",alignment" suffix.
                colpos, comma, colalign = value.partition(',')
                if not comma:
                    colalign = None

                # 3.
                number = self.parse_percentage_string(colpos)
                if number is None:
                    continue

                # 4. - 6. This is the cue's *position* alignment; writing it
                # to line_alignment would clobber the `line` setting.
                if colalign in ('line-left', 'center', 'line-right'):
                    self.cue.position_alignment = colalign
                # 7. Any other alignment keyword invalidates the setting.
                elif colalign is not None:
                    continue

                # 8.
                self.cue.position = number

            elif name == 'size':
                # 1.
                number = self.parse_percentage_string(value)
                if number is None:
                    continue

                # 2.
                self.cue.cue_size = number

            elif name == 'align':
                # 1. - 5.
                if value in ('start', 'center', 'end', 'left', 'right'):
                    self.cue.text_alignment = value

    def collect_timestamp(self):
        """Collect a WebVTT timestamp starting at ``self.position``.

        Accepted forms are ``mm:ss.ttt`` and ``h...:mm:ss.ttt``.  Returns the
        time in seconds as a number, or None when no valid timestamp starts
        at the cursor.  The cursor is left wherever parsing stopped.
        """
        # 1. (handled by class)

        # 2.
        most_significant_units = 'minutes'

        # 3.
        if self.position >= len(self.input):
            return None

        # 4.
        if self.input[self.position] not in DIGITS:
            return None

        # 5.
        digits = self.collect_characters(lambda c: c in DIGITS)

        # 6.
        value_1 = int(digits)

        # 7. Anything other than a two-digit value up to 59 must be hours.
        if len(digits) != 2 or value_1 > 59:
            most_significant_units = 'hours'

        # 8.
        if self.position >= len(self.input) or self.input[self.position] != ':':
            return None
        self.position += 1

        # 9.
        digits = self.collect_characters(lambda c: c in DIGITS)

        # 10.
        if len(digits) != 2:
            return None

        # 11.
        value_2 = int(digits)

        # 12. A third component is required when the first one was hours,
        # and accepted whenever another colon follows.
        if most_significant_units == 'hours' or self.position < len(self.input) and self.input[self.position] == ':':
            # 1.
            if self.position >= len(self.input) or self.input[self.position] != ':':
                return None
            self.position += 1

            # 2.
            digits = self.collect_characters(lambda c: c in DIGITS)

            # 3.
            if len(digits) != 2:
                return None

            # 4.
            value_3 = int(digits)
        else:
            # Only minutes and seconds were given: shift them down and use
            # zero hours.
            value_3 = value_2
            value_2 = value_1
            value_1 = 0

        # 13.
        if self.position >= len(self.input) or self.input[self.position] != '.':
            return None
        self.position += 1

        # 14.
        digits = self.collect_characters(lambda c: c in DIGITS)

        # 15.
        if len(digits) != 3:
            return None

        # 16.
        value_4 = int(digits)

        # 17. Minutes and seconds may be at most 59 (59 itself is valid).
        if value_2 > 59 or value_3 > 59:
            return None

        # 18.
        result = value_1 * 60 * 60 + value_2 * 60 + value_3 + value_4 / 1000

        # 19.
        return result

def main(argv):
    """Parse each file named in ``argv[1:]`` and print what was found.

    For every file the collected cues, style sheets, regions and errors are
    printed to standard output.
    """
    for path in argv[1:]:
        # One file at a time inside a `with`, so no handle is left open when
        # a later path cannot be opened or parsing raises.
        with open(path, 'r') as file:
            parser = VTTParser(file.read())
            parser.parse()

            print("Results: {}".format(file))
            print("  Cues: {}".format(parser.text_tracks))
            print("  StyleSheets: {}".format(parser.stylesheets))
            print("  Regions: {}".format(parser.regions))
            print("  Errors: {}".format(parser.errors))

# Script entry point: parse the files named on the command line.
if __name__ == '__main__':
    import sys

    main(sys.argv)