An extended JSON parser implemented as a recursive descent parser


#!/usr/bin/env python3 class ParseError(Exception): def __init__(self, pos, msg, *args): self.pos = pos self.msg = msg self.args = args def __str__(self): return '%s at position %s' % (self.msg % self.args, self.pos) class Parser: def __init__(self): self.cache = {} def parse(self, text): self.text = text self.pos = -1 self.len = len(text) - 1 rv = self.start() self.assert_end() return rv def assert_end(self): if self.pos < self.len: raise ParseError( self.pos + 1, 'Expected end of string but got %s', self.text[self.pos + 1] ) def eat_whitespace(self): while self.pos < self.len and self.text[self.pos + 1] in " \f\v\r\t\n": self.pos += 1 def split_char_ranges(self, chars): try: return self.cache[chars] except KeyError: pass rv = [] index = 0 length = len(chars) while index < length: if index + 2 < length and chars[index + 1] == '-': if chars[index] >= chars[index + 2]: raise ValueError('Bad character range') rv.append(chars[index:index + 3]) index += 3 else: rv.append(chars[index]) index += 1 self.cache[chars] = rv return rv def char(self, chars=None): if self.pos >= self.len: raise ParseError( self.pos + 1, 'Expected %s but got end of string', 'character' if chars is None else '[%s]' % chars ) next_char = self.text[self.pos + 1] if chars == None: self.pos += 1 return next_char for char_range in self.split_char_ranges(chars): if len(char_range) == 1: if next_char == char_range: self.pos += 1 return next_char elif char_range[0] <= next_char <= char_range[2]: self.pos += 1 return next_char raise ParseError( self.pos + 1, 'Expected %s but got %s', 'character' if chars is None else '[%s]' % chars, next_char ) def keyword(self, *keywords): self.eat_whitespace() if self.pos >= self.len: raise ParseError( self.pos + 1, 'Expected %s but got end of string', ','.join(keywords) ) for keyword in keywords: low = self.pos + 1 high = low + len(keyword) if self.text[low:high] == keyword: self.pos += len(keyword) self.eat_whitespace() return keyword raise ParseError( self.pos + 1, 'Expected %s but got %s', ','.join(keywords), self.text[self.pos + 1], ) def match(self, *rules): self.eat_whitespace() last_error_pos = -1 last_exception = None last_error_rules = [] for rule in rules: initial_pos = self.pos try: rv = getattr(self, rule)() self.eat_whitespace() return rv except ParseError as e: self.pos = initial_pos if e.pos > last_error_pos: last_exception = e last_error_pos = e.pos last_error_rules.clear() last_error_rules.append(rule) elif e.pos == last_error_pos: last_error_rules.append(rule) if len(last_error_rules) == 1: raise last_exception else: raise ParseError( last_error_pos, 'Expected %s but got %s', ','.join(last_error_rules), self.text[last_error_pos] ) def maybe_char(self, chars=None): try: return self.char(chars) except ParseError: return None def maybe_match(self, *rules): try: return self.match(*rules) except ParseError: return None def maybe_keyword(self, *keywords): try: return self.keyword(*keywords) except ParseError: return None class JSONParser(Parser): def eat_whitespace(self): is_processing_comment = False while self.pos < self.len: char = self.text[self.pos + 1] if is_processing_comment: if char == '\n': is_processing_comment = False else: if char == '#': is_processing_comment = True elif char not in ' \f\v\r\t\n': break self.pos += 1 def start(self): return self.match('any_type') def any_type(self): return self.match('complex_type', 'primitive_type') def primitive_type(self): return self.match('null', 'boolean', 'quoted_string', 'unquoted') def complex_type(self): return self.match('list', 'map') def list(self): rv = [] self.keyword('[') while True: item = self.maybe_match('any_type') if item is None: break rv.append(item) if not self.maybe_keyword(','): break self.keyword(']') return rv def map(self): rv = {} self.keyword('{') while True: item = self.maybe_match('pair') if item is None: break rv[item[0]] = item[1] if not self.maybe_keyword(','): break self.keyword('}') return rv def pair(self): key = self.match('quoted_string', 'unquoted') if type(key) is not str: raise ParseError( self.pos + 1, 'Expected string but got number', self.text[self.pos + 1] ) self.keyword(':') value = self.match('any_type') return key, value def null(self): self.keyword('null') return None def boolean(self): boolean = self.keyword('true', 'false') return boolean[0] == 't' def unquoted(self): acceptable_chars = '0-9A-Za-z \t!$%&()*+./;<=>?^_`|~-' number_type = int chars = [self.char(acceptable_chars)] while True: char = self.maybe_char(acceptable_chars) if char is None: break if char in 'Ee.': number_type = float chars.append(char) rv = ''.join(chars).rstrip(' \t') try: return number_type(rv) except ValueError: return rv def quoted_string(self): quote = self.char('"\'') chars = [] escape_sequences = { 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', } while True: char = self.char() if char == quote: break elif char == '\\': escape = self.char() if escape == 'u': code_point = [] for i in range(4): code_point.append(self.char('0-9a-fA-F')) chars.append(chr(int(''.join(code_point), 16))) else: chars.append(escape_sequences.get(char, char)) else: chars.append(char) return ''.join(chars) import sys from pprint import pprint parser = JSONParser() try: pprint(parser.parse(sys.stdin.read())) except ParseError as e: print('Error: '+ str(e))
Output
(Run the program to view its output)
Comments