#!/usr/bin/env python3
# Copyright (c) 2011 Qtrac Ltd. All rights reserved.
# This program or module is free software: you can redistribute it and/or
# modify it under the terms of the GNU General Public License as published
# by the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version. It is provided for educational
# purposes and is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.

# The docstring is a raw string because the BNF below contains regex
# escapes (\s, \w, \d) that are not valid Python string escapes.
r"""
BNF

    BIBTEXES    ::= BIBTEX+
    BIBTEX      ::= '@Book{' IDENTIFIER ',' KEY_VALUES '}'
    IDENTIFIER  ::= [a-zA-Z][^,\s]*
    KEY_VALUES  ::= KEY_VALUE | KEY_VALUE ',' KEY_VALUES
    KEY_VALUE   ::= KEY '=' VALUE
    KEY         ::= [a-zA-Z]\w*
    VALUE       ::= "[^"]+" | \d+
"""

import pprint
import re

import ply.lex
try:
    from pyparsing import (alphas, alphanums, delimitedList, nums,
            OneOrMore, ParseException, QuotedString, Regex, Suppress,
            Word)
except ImportError:
    from pyparsing_py3 import (alphas, alphanums, delimitedList, nums,
            OneOrMore, ParseException, QuotedString, Regex, Suppress,
            Word)


# The examples are copied from Wikipedia
TEXT = """
@Book{blanchette+summerfield,
    author = "Jasmin Blanchette and Mark Summerfield",
    title = "C++ GUI Programming with Qt 4, Second Edition",
    publisher = "Prentice Hall",
    year = 2008,
    address = "New York"
}

@Book{abramowitz+stegun,
    author = "Milton {Abramowitz} and Irene A. {Stegun}",
    title = "Handbook of Mathematical Functions with
             Formulas, Graphs, and Mathematical Tables",
    publisher = "Dover",
    year = 1964,
    address = "New York",
    edition = "ninth Dover printing, tenth GPO printing"
}

@Book{hicks2001,
    author = "von Hicks, III, Michael",
    title = "Design of a Carbon Fiber Composite Grid Structure for
             the GLAST Spacecraft Using a Novel Manufacturing Technique",
    publisher = "Stanford Press",
    year = 2001,
    address = "Palo Alto",
    edition = "1st,",
    isbn = "0-69-697269-4"
}

@Book{Torre2008,
    author = "Joe Torre and Tom Verducci",
    publisher = "Doubleday",
    title = "The Yankee Years",
    year = 2008,
    isbn = "0385527403"
}
"""


def pyparsing_parse(text):
    """Parse BibTeX @Book entries from text using PyParsing.

    Returns a dict that maps each entry's identifier to a dict of its
    key-value pairs. Numeric values are converted to int; quoted values
    have every run of whitespace collapsed to a single space.

    A ParseException is reported with print() rather than raised, and
    whatever books were collected up to that point are returned.
    """
    WHITESPACE = re.compile(r"\s+")
    books = {}
    key_values = {}  # pairs gathered for the entry currently being parsed

    def normalize(tokens):
        # Quoted values may span several lines; flatten the whitespace.
        return WHITESPACE.sub(" ", tokens[0])

    def add_key_value(tokens):
        key_values[tokens.key] = tokens.value

    def add_book(tokens):
        # Store a copy, then empty the shared dict ready for the next entry.
        books[tokens.identifier] = key_values.copy()
        key_values.clear()

    left_brace, right_brace, comma, equals = map(Suppress, "{},=")
    start = Suppress("@Book") + left_brace
    identifier = Regex(r"[a-zA-Z][^,\s]*")("identifier") + comma
    key = Word(alphas, alphanums)("key")
    value = (Word(nums).setParseAction(lambda t: int(t[0])) |
             QuotedString('"', multiline=True).setParseAction(normalize)
             )("value")
    key_value = (key + equals + value).setParseAction(add_key_value)
    end = right_brace
    bibtex = (start + identifier + delimitedList(key_value) + end
              ).setParseAction(add_book)
    parser = OneOrMore(bibtex)
    try:
        parser.parseString(text)
    except ParseException as err:
        print("parse error: {0}".format(err))
    return books


def ply_parse(text):
    """Parse BibTeX @Book entries from text using PLY's lexer only.

    Returns a dict that maps each entry's identifier to a dict of its
    key-value pairs, in the same shape pyparsing_parse() produces.

    A value that appears before any book has been started, or without a
    preceding key, is reported with print() and skipped.
    """
    WHITESPACE = re.compile(r"\s+")

    # PLY picks up `tokens` and the `t_*` rules below by name from this
    # function's namespace, and uses each rule function's docstring as its
    # regex, so these names and docstrings must stay exactly as they are.
    tokens = ("START", "IDENTIFIER", "KEY", "NUMBER", "QUOTEDSTRING",
              "COMMA", "END")

    t_ignore_START = r"@Book"

    def t_IDENTIFIER(t):
        r"\{[a-zA-Z][^,\s]*"
        t.value = t.value[1:]  # drop the leading "{"
        return t

    t_KEY = r"[a-zA-Z]\w*"

    def t_NUMBER(t):
        r"=\s*\d+"
        # Drop the "=" and any whitespace after it, then convert.
        t.value = int(t.value[1:].lstrip())
        return t

    def t_QUOTEDSTRING(t):
        r'=\s*"[^="]+"'
        # Drop the "=" and following whitespace, remove the surrounding
        # quotes, trim the ends and collapse inner whitespace runs.
        t.value = WHITESPACE.sub(" ",
                                 t.value[1:].lstrip()[1:-1].strip())
        return t

    t_ignore_COMMA = r","
    t_ignore_END = r"\}"

    t_ignore = " \t\n"

    def t_newline(t):
        r"\n+"
        # The text handed to the lexer below has had its newlines replaced
        # by spaces, so for that input this rule has nothing to match and
        # the line numbers reported by this function do not advance.
        t.lexer.lineno += len(t.value)

    def t_error(t):
        # Report only the first line of the text that could not be lexed.
        # NOTE(review): this handler never calls t.lexer.skip(); PLY is
        # expected to raise LexError when the position is not advanced --
        # confirm that aborting on bad input is the intended behavior.
        line = t.value.lstrip()
        i = line.find("\n")
        line = line if i == -1 else line[:i]
        print("failed to parse line {0}: {1}".format(t.lineno + 1, line))

    books = {}
    book = key = None
    lexer = ply.lex.lex()
    lexer.input(text.replace("\n", " "))
    for token in lexer:
        if token.type == "IDENTIFIER":
            books[token.value] = book = {}
            key = None  # a key must never carry over from the last book
            continue
        if book is None:
            print("missing start of book line {0}".format(token.lineno))
            continue  # nothing to store into; avoids indexing None
        if token.type == "KEY":
            key = token.value
            continue
        if key is None:
            print("missing key line {0}".format(token.lineno))
            continue  # do not store a value under a None key
        if token.type in ("QUOTEDSTRING", "NUMBER"):
            book[key] = token.value
            key = None  # each value consumes its key
    return books


def main():
    """Parse TEXT with both parsers, print the result and compare them."""
    books_ply = ply_parse(TEXT)
    books_pyparsing = pyparsing_parse(TEXT)
    pprint.pprint(books_pyparsing)
    assert books_ply == books_pyparsing


if __name__ == "__main__":
    main()