parser.py 15.9 KB
Newer Older
Robin Sonnabend's avatar
Robin Sonnabend committed
1
import regex as re
Robin Sonnabend's avatar
Robin Sonnabend committed
2
import sys
Robin Sonnabend's avatar
Robin Sonnabend committed
3
from collections import OrderedDict
Robin Sonnabend's avatar
Robin Sonnabend committed
4
from enum import Enum
Robin Sonnabend's avatar
Robin Sonnabend committed
5

6
7
from shared import escape_tex

Robin Sonnabend's avatar
Robin Sonnabend committed
8
9
import config

Robin Sonnabend's avatar
Robin Sonnabend committed
10
class ParserException(Exception):
    """
    Raised when the protocol source cannot be parsed.

    Attributes:
    - message: description of the problem
    - linenumber: line of the source the problem was found at, or None
    """
    name = "Parser Exception"
    # If a subclass sets this to True it also has to provide an
    # `explanation` string, which __str__ appends to the message, e.g.
    # "The source did generally not match the expected protocol syntax."
    has_explanation = False

    def __init__(self, message, linenumber=None):
        # Pass the arguments on to Exception so that e.args, repr() and
        # pickling behave like for every other exception.
        super().__init__(message, linenumber)
        self.message = message
        self.linenumber = linenumber

    def __str__(self):
        """
        Returns the message, prefixed with the line number if there is one
        and followed by the explanation if the class provides one.
        """
        if self.linenumber is not None:
            result = "Exception at line {}: {}".format(self.linenumber, self.message)
        else:
            result = "Exception: {}".format(self.message)
        if self.has_explanation:
            result += "\n" + self.explanation
        return result

Robin Sonnabend's avatar
Robin Sonnabend committed
28
29
30
31
32
33
34
35
class RenderType(Enum):
    """Output formats that can be requested from the render() methods."""
    # The members are compared against the `render_type` argument of the
    # render() methods of the syntax elements in this file.
    latex = 0
    wikitext = 1
    plaintext = 2

def _not_implemented(self, render_type):
    """
    Builds the error for a render type an element does not support.
    The exception is only created, raising it is left to the caller.
    Arguments:
    - self: the element that was asked to render
    - render_type: the requested render type, its `name` is put into the message
    Returns:
    - a NotImplementedError instance
    """
    element_name = self.__class__.__name__
    message = "The rendertype {} has not been implemented for {}.".format(render_type.name, element_name)
    return NotImplementedError(message)

Robin Sonnabend's avatar
Robin Sonnabend committed
36
37
38
39
40
class Element:
    """
    Generic (abstract) base element, not meant to be instantiated.
    Shows which methods an element class is expected to provide and
    offers the shared parsing helpers parse_inner and parse_outer.
    """

    # An "x" that is not preceded by an "x": this can never match, which is
    # fine because the base element is never supposed to be parsed.
    PATTERN = r"x(?<!x)"

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders the element.
        Returns:
        - a placeholder text, as the base element has nothing to render
        """
        return "Generic Base Syntax Element, this is not supposed to appear."

    def dump(self, level=None):
        """
        Prints a debug representation, indented by `level` spaces.
        """
        indentation = " " * (0 if level is None else level)
        print("{}element".format(indentation))

    @staticmethod
    def parse(match, current, linenumber=None):
        """
        Parses a match of this elements pattern.
        Arguments:
        - match: the match of this elements pattern
        - current: the current element of the document. Should be a fork. May be modified.
        - linenumber: the current line number, for error messages
        Returns:
        - the new current element
        - the line number after parsing this element
        Raises:
        - ParserException: always, the base element cannot be parsed
        """
        raise ParserException("Trying to parse the generic base element!", linenumber)

    @staticmethod
    def parse_inner(match, current, linenumber=None):
        """
        Shared first step of parsing: checks that there is a match and
        advances the line number by the newlines the match contains.
        Arguments:
        - match: the match of this elements pattern
        - current: the current element of the document (not used here)
        - linenumber: the current line number, None counts as 0
        Returns:
        - the new line number
        Raises:
        - ParserException: if match is None
        """
        if match is None:
            raise ParserException("Source does not match!", linenumber)
        start = 0 if linenumber is None else linenumber
        consumed_lines = match.group().count("\n")
        return consumed_lines + start

    @staticmethod
    def parse_outer(element, current):
        """
        Inserts the parsed element into the tree.
        Arguments:
        - element: the new parsed element to insert
        - current: the current element of the parsed document
        Returns:
        - the new current element: the element itself if it is a Fork,
          otherwise the unchanged current element
        """
        current.append(element)
        return element if isinstance(element, Fork) else current
Robin Sonnabend's avatar
Robin Sonnabend committed
100
101

class Content(Element):
    """
    One statement of the protocol: a sequence of Text and Tag children.
    """

    # v1: has problems with missing semicolons
    #PATTERN = r"\s*(?<content>(?:[^\[\];]+)?(?:\[[^\]]+\][^;\[\]]*)*);"
    # v2: does not require the semicolon, but the newline
    PATTERN = r"\s*(?<content>(?:[^\[\];\r\n]+)?(?:\[[^\]\r\n]+\][^;\[\]\r\n]*)*);?"

    def __init__(self, children, linenumber):
        self.children = children
        self.linenumber = linenumber

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders all children and concatenates the results without separator.
        """
        rendered = [
            child.render(render_type, show_private, level=level, protocol=protocol)
            for child in self.children
        ]
        return "".join(rendered)

    def dump(self, level=None):
        """
        Prints a debug representation of this element and its children.
        """
        depth = 0 if level is None else level
        print("{}content:".format(" " * depth))
        for child in self.children:
            child.dump(depth + 1)

    def get_tags(self, tags):
        """
        Appends every child that is a Tag to the given list.
        Returns:
        - the same list that was passed in
        """
        for child in self.children:
            if isinstance(child, Tag):
                tags.append(child)
        return tags

    @staticmethod
    def parse(match, current, linenumber=None):
        """
        Parses a match of the content pattern and inserts the resulting
        element into the tree, unless the matched content is empty.
        Returns:
        - the new current element
        - the line number after parsing this element
        Raises:
        - ParserException: if the content group is missing or does not
          consist of texts and tags
        """
        linenumber = Element.parse_inner(match, current, linenumber)
        content = match.group("content")
        if content is None:
            raise ParserException("Content is missing its content!", linenumber)
        if len(content) == 0:
            # nothing to insert (from_content would build an empty element)
            return current, linenumber
        element = Content.from_content(content, linenumber)
        return Element.parse_outer(element, current), linenumber

    @staticmethod
    def from_content(content, linenumber):
        """
        Splits the content into children by repeatedly applying the first
        matching entry of TEXT_PATTERNS to the start of the remaining text.
        Raises:
        - ParserException: if none of the patterns matches
        """
        children = []
        remaining = content
        while len(remaining) > 0:
            for pattern, build_child in TEXT_PATTERNS.items():
                match = pattern.match(remaining)
                if match is None:
                    continue
                children.append(build_child(match, linenumber))
                remaining = remaining[len(match.group()):]
                break
            else:
                raise ParserException("Content does not match inner!", linenumber)
        return Content(children, linenumber)
Robin Sonnabend's avatar
Robin Sonnabend committed
152
153

class Text:
    """
    A piece of plain text inside a Content element.
    """

    # everything up to the next opening bracket or the end of the content
    PATTERN = r"(?<text>[^\[]+)(?:(?=\[)|$)"

    def __init__(self, text, linenumber):
        self.text = text
        self.linenumber = linenumber

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders the text: passed through escape_tex for latex, unchanged
        for wikitext and plaintext.
        Raises:
        - NotImplementedError: for any other render type
        """
        if render_type == RenderType.latex:
            return escape_tex(self.text)
        if render_type == RenderType.wikitext or render_type == RenderType.plaintext:
            return self.text
        raise _not_implemented(self, render_type)

    def dump(self, level=None):
        """
        Prints a debug representation, indented by `level` spaces.
        """
        indentation = " " * (0 if level is None else level)
        print("{}text: {}".format(indentation, self.text))

    @staticmethod
    def parse(match, linenumber):
        """
        Builds a Text from a match of the text pattern.
        Arguments:
        - match: the match, has to provide the group "text"
        - linenumber: the current line number
        Raises:
        - ParserException: if there is no match or the group is None
        """
        if match is None:
            raise ParserException("Text is not actually a text!", linenumber)
        text = match.group("text")
        if text is None:
            raise ParserException("Text is empty!", linenumber)
        return Text(text, linenumber)


class Tag:
    """
    A tag inside a Content element, written as [name;value;value...].

    Attributes:
    - name: the part before the first semicolon
    - values: list of the remaining semicolon separated parts, may be empty
    - linenumber: the line number the tag was found at
    - todo: object used to render "todo" tags, None until it is set
    """

    def __init__(self, name, values, linenumber):
        self.name = name
        self.values = values
        self.linenumber = linenumber
        # NOTE(review): nothing in this file assigns `todo`, it is presumably
        # attached by the caller after parsing. Default to None so that
        # rendering a todo tag without it does not raise an AttributeError.
        self.todo = None

    def _first_value(self):
        # A tag like "[url]" has no values, render it with an empty value
        # instead of failing with an IndexError.
        return self.values[0] if self.values else ""

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders the tag.
        "url" tags become a link, "todo" tags are rendered by the attached
        todo object (latex and wikitext only, if there is one), everything
        else is rendered as "Name: first value".
        Raises:
        - NotImplementedError: for an unknown render type
        """
        value = self._first_value()
        has_todo = self.name == "todo" and self.todo is not None
        if render_type == RenderType.latex:
            if self.name == "url":
                return r"\url{{{}}}".format(value)
            elif has_todo:
                return self.todo.render_latex(current_protocol=protocol)
            return r"\textbf{{{}:}} {}".format(escape_tex(self.name.capitalize()), escape_tex(value))
        elif render_type == RenderType.plaintext:
            if self.name == "url":
                return value
            return "{}: {}".format(self.name.capitalize(), value)
        elif render_type == RenderType.wikitext:
            if self.name == "url":
                return "[{0} {0}]".format(value)
            elif has_todo:
                return self.todo.render_wikitext(current_protocol=protocol)
            return "'''{}:''' {}".format(self.name.capitalize(), value)
        else:
            raise _not_implemented(self, render_type)

    def dump(self, level=None):
        """
        Prints a debug representation, indented by `level` spaces.
        """
        if level is None:
            level = 0
        print("{}tag: {}: {}".format(" " * level, self.name, "; ".join(self.values)))

    @staticmethod
    def parse(match, linenumber):
        """
        Builds a Tag from a match of the tag pattern.
        Arguments:
        - match: the match, has to provide the group "content"
        - linenumber: the current line number
        Raises:
        - ParserException: if there is no match or the group is None
        """
        if match is None:
            raise ParserException("Tag is not actually a tag!", linenumber)
        content = match.group("content")
        if content is None:
            raise ParserException("Tag is empty!", linenumber)
        name, *values = content.split(";")
        return Tag(name, values, linenumber)

    PATTERN = r"\[(?<content>(?:[^;\]]*;)*(?:[^;\]]*))\]"

class Empty(Element):
    """
    Whitespace between the other elements. Parsing it only advances the
    line number, nothing is inserted into the tree.
    """

    def __init__(self, linenumber):
        # This used to assign the argument to itself, so instances had no
        # linenumber attribute, unlike all other elements.
        self.linenumber = linenumber

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders to the empty string for every render type.
        """
        return ""

    def dump(self, level=None):
        """
        Prints a debug representation, indented by `level` spaces.
        """
        if level is None:
            level = 0
        print("{}empty".format(" " * level))

    @staticmethod
    def parse(match, current, linenumber=None):
        """
        Consumes a match of the whitespace pattern.
        Returns:
        - the unchanged current element
        - the line number after the whitespace
        Raises:
        - ParserException: if match is None
        """
        linenumber = Element.parse_inner(match, current, linenumber)
        return current, linenumber

    PATTERN = r"\s+"

class Remark(Element):
    """
    A remark line of the form "#name;value".
    """

    def __init__(self, name, value, linenumber):
        self.name = name
        self.value = value
        self.linenumber = linenumber

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders the remark as "name: value", with the name in bold for latex.
        Raises:
        - NotImplementedError: for an unknown render type
        """
        if render_type == RenderType.latex:
            # NOTE(review): name and value are inserted without escape_tex,
            # unlike in Text and Tag -- confirm that this is intended.
            return r"\textbf{{{}}}: {}".format(self.name, self.value)
        elif render_type == RenderType.wikitext:
            return "{}: {}".format(self.name, self.value)
        elif render_type == RenderType.plaintext:
            # Used to format the non-existing RenderType.plaintex into a
            # template with two placeholders, which always failed.
            return "{}: {}".format(self.name, self.value)
        else:
            # same handling of unknown render types as in Text, Tag and Fork
            raise _not_implemented(self, render_type)

    def dump(self, level=None):
        """
        Prints a debug representation, indented by `level` spaces.
        """
        if level is None:
            level = 0
        print("{}remark: {}: {}".format(" " * level, self.name, self.value))

    def get_tags(self, tags):
        """
        A remark contains no tags, the list is returned unchanged.
        """
        return tags

    @staticmethod
    def parse(match, current, linenumber=None):
        """
        Parses a match of the remark pattern and inserts the remark into
        the tree.
        Returns:
        - the new current element
        - the line number after parsing this element
        Raises:
        - ParserException: if the content group is missing or contains no
          semicolon separating name and value
        """
        linenumber = Element.parse_inner(match, current, linenumber)
        content = match.group("content")
        if content is None:
            raise ParserException("Remark is missing its content!", linenumber)
        # split at the first semicolon only, the value may contain more
        parts = content.split(";", 1)
        if len(parts) < 2:
            raise ParserException("Remark value is empty!", linenumber)
        name, value = parts
        element = Remark(name, value, linenumber)
        current = Element.parse_outer(element, current)
        return current, linenumber

    PATTERN = r"\s*\#(?<content>[^\n]+)"

class Fork(Element):
    """
    A block in braces containing further elements. The root of the parsed
    tree is a fork without environment, name and parent.

    Attributes:
    - environment: text directly after the opening brace, None if empty
    - name: stripped name of the fork, None if empty
    - parent: the enclosing fork, None for the root
    - linenumber: the line number the fork was found at
    - children: list of the contained elements
    """

    def __init__(self, environment, name, parent, linenumber, children=None):
        self.environment = environment if environment is None or len(environment) > 0 else None
        self.name = name.strip() if (name is not None and len(name) > 0) else None
        self.parent = parent
        self.linenumber = linenumber
        self.children = [] if children is None else children

    def dump(self, level=None):
        """
        Prints a debug representation of the fork and its children.
        """
        if level is None:
            level = 0
        print("{}fork: {}".format(" " * level, self.name))
        for child in self.children:
            child.dump(level + 1)

    def test_private(self, name):
        """
        Checks whether the name, without colons and surrounding whitespace,
        is one of config.PRIVATE_KEYWORDS.
        """
        if name is None:
            # The root and unnamed forks have no name. They cannot be a
            # private section, and calling replace() on None would fail.
            return False
        stripped_name = name.replace(":", "").strip()
        return stripped_name in config.PRIVATE_KEYWORDS

    def _render_children(self, render_type, show_private, level, protocol):
        # Render all children one level deeper, drop those rendering to nothing.
        parts = []
        for child in self.children:
            part = child.render(render_type, show_private, level=level + 1, protocol=protocol)
            if len(part.strip()) == 0:
                continue
            parts.append(part)
        return parts

    def render(self, render_type, show_private, level=None, protocol=None):
        """
        Renders the fork and its children.
        Arguments:
        - render_type: the RenderType to render to
        - show_private: whether forks with a private name are rendered
        - level: nesting depth of this fork, None is treated as 0
        - protocol: passed on to the children
        Returns:
        - the rendered text, empty for a hidden private fork
        Raises:
        - NotImplementedError: for an unknown render type
        """
        if level is None:
            # same default as the render methods of the other elements
            level = 0
        name_line = self.name if self.name is not None and len(self.name) > 0 else ""
        if render_type == RenderType.latex:
            begin_line = r"\begin{itemize}"
            end_line = r"\end{itemize}"
            # every child becomes an item of the list
            content_parts = [
                part if part.startswith(r"\item") else r"\item {}".format(part)
                for part in self._render_children(render_type, show_private, level, protocol)
            ]
            content_lines = "\n".join(content_parts)
            if level == 0:
                # at level 0 the name is not part of the output
                return "\n".join([begin_line, content_lines, end_line])
            elif self.test_private(self.name):
                if show_private:
                    return content_lines
                else:
                    return ""
            else:
                # NOTE(review): name_line is inserted without escape_tex --
                # confirm that this is intended.
                return "\n".join([name_line, begin_line, content_lines, end_line])
        elif render_type == RenderType.wikitext:
            title_line = "{0} {1} {0}".format("=" * (level + 2), name_line)
            content_parts = self._render_children(render_type, show_private, level, protocol)
            content_lines = "{}\n\n{}\n".format(title_line, "\n\n".join(content_parts))
            if self.test_private(self.name) and not show_private:
                return ""
            else:
                return content_lines
        elif render_type == RenderType.plaintext:
            title_line = "{} {}".format("#" * (level + 1), name_line)
            content_parts = self._render_children(render_type, show_private, level, protocol)
            content_lines = "{}\n{}".format(title_line, "\n".join(content_parts))
            if self.test_private(self.name) and not show_private:
                return ""
            else:
                return content_lines
        else:
            raise _not_implemented(self, render_type)

    def get_tags(self, tags=None):
        """
        Collects the tags of all children by calling their get_tags.
        Arguments:
        - tags: list to extend, a new list is created if None
        Returns:
        - the list of tags
        """
        if tags is None:
            tags = []
        for child in self.children:
            child.get_tags(tags)
        return tags

    def is_anonymous(self):
        """
        Returns whether the fork has no environment.
        """
        return self.environment is None

    def is_root(self):
        """
        Returns whether the fork has no parent.
        """
        return self.parent is None

    @staticmethod
    def create_root():
        """
        Creates the root of a new tree: no environment, name or parent, line number 0.
        """
        return Fork(None, None, None, 0)

    @staticmethod
    def parse(match, current, linenumber=None):
        """
        Parses a match of the fork pattern and inserts the new fork into
        the tree. The name is built from the text before the brace and the
        text after the environment.
        Returns:
        - the new fork, as it is the new current element
        - the line number after parsing the opening line
        Raises:
        - ParserException: if match is None
        """
        linenumber = Element.parse_inner(match, current, linenumber)
        environment = match.group("environment")
        name1 = match.group("name1")
        name2 = match.group("name2")
        name = ""
        if name1 is not None:
            name = name1
        if name2 is not None:
            name += " {}".format(name2)
        element = Fork(environment, name, current, linenumber)
        current = Element.parse_outer(element, current)
        return current, linenumber

    @staticmethod
    def parse_end(match, current, linenumber=None):
        """
        Parses a match of the end pattern, closing the current fork.
        Returns:
        - the parent of the current fork as new current element
        - the line number after parsing the closing brace
        Raises:
        - ParserException: if match is None or the current fork is the root
        """
        linenumber = Element.parse_inner(match, current, linenumber)
        if current.is_root():
            raise ParserException("Found end tag for root element!", linenumber)
        current = current.parent
        return current, linenumber

    def append(self, element):
        """
        Adds the element as last child.
        """
        self.children.append(element)

    PATTERN = r"\s*(?<name1>[^{};]+)?{(?<environment>\S+)?\h*(?<name2>[^\n]+)?"
    END_PATTERN = r"\s*};?"

# Block level syntax: compiled pattern -> parse function. parse() tries the
# patterns in this order and uses the first one that matches.
PATTERNS = OrderedDict(
    (re.compile(pattern_source), parse_function)
    for pattern_source, parse_function in (
        (Fork.PATTERN, Fork.parse),
        (Fork.END_PATTERN, Fork.parse_end),
        (Remark.PATTERN, Remark.parse),
        (Content.PATTERN, Content.parse),
        (Empty.PATTERN, Empty.parse),
    )
)

# Syntax inside of a content: compiled pattern -> parse function, used by
# Content.from_content in the same first-match manner.
TEXT_PATTERNS = OrderedDict(
    (re.compile(pattern_source), parse_function)
    for pattern_source, parse_function in (
        (Text.PATTERN, Text.parse),
        (Tag.PATTERN, Tag.parse),
    )
)

def parse(source):
    """
    Parses the protocol source into a tree of elements.
    The first entry of PATTERNS matching the start of the remaining source
    is consumed and handed to its parse function, until nothing is left.
    Arguments:
    - source: the protocol source as string
    Returns:
    - the root fork of the parsed tree
    Raises:
    - ParserException: if no pattern matches, a parse function fails or
      the source ends before all forks are closed
    """
    linenumber = 1
    tree = Fork.create_root()
    current = tree
    while len(source) > 0:
        for pattern, parse_function in PATTERNS.items():
            match = pattern.match(source)
            # Content.PATTERN is able to match the empty string (e.g. in
            # front of a stray "]"). Such a match consumes nothing, taking
            # it would repeat the same step forever, so it does not count.
            if match is None or len(match.group()) == 0:
                continue
            source = source[len(match.group()):]
            current, linenumber = parse_function(match, current, linenumber)
            break
        else:
            raise ParserException("No matching syntax element found!", linenumber)
    if current is not tree:
        raise ParserException("Source ended within fork! (started at line {})".format(current.linenumber))
    return tree

Robin Sonnabend's avatar
Robin Sonnabend committed
435
def main(test_file_name=None):
    """
    Parses test/<test_file_name>.txt and dumps the resulting tree.
    Arguments:
    - test_file_name: name of the test file without extension,
      "source0" if None or empty
    Returns:
    - 0 if the file was parsed, 1 if a ParserException occurred, so that
      the result can be used as exit status
    """
    test_file_name = test_file_name or "source0"
    with open("test/{}.txt".format(test_file_name)) as f:
        source = f.read()
    try:
        tree = parse(source)
        tree.dump()
    except ParserException as e:
        print(e)
        return 1
    print("worked!")
    return 0
Robin Sonnabend's avatar
Robin Sonnabend committed
447
448
449
    

if __name__ == "__main__":
    # optional first argument: name of the test file, without extension
    test_file_name = sys.argv[1] if len(sys.argv) > 1 else None
    # sys.exit instead of the builtin exit: the latter is provided by the
    # site module and is not available in every environment
    sys.exit(main(test_file_name))