🎉 The "JSON parser from scratch" chapter is now out!
JSON parser
Wrapping up the parser

Wrapping up

Here are some other challenges worth exploring:

  • better error reporting (location, reason for failure, etc.)
  • converting the AST back into a Python dictionary

Maybe next up, we can write a pretty-printer or beautifier for JSON!

Codebase

from dataclasses import dataclass
from enum import Enum
from typing import Any
 
 
class TokenType(Enum):
    """Kinds of lexical token that `tokenize` emits.

    Each member's value is a human-readable label for that kind.
    """

    # Structural punctuation: { } [ ]
    BRACE_OPEN = "BraceOpen"
    BRACE_CLOSE = "BraceClose"
    BRACKET_OPEN = "BracketOpen"
    BRACKET_CLOSE = "BracketClose"
    # Scalar payloads.
    STRING = "String"
    NUMBER = "Number"
    # Separators: , :
    COMMA = "Comma"
    COLON = "Colon"
    # The literals true / false / null.
    TRUE = "True"
    FALSE = "False"
    NULL = "Null"
 
 
class ASTNodeType(Enum):
    """Tags identifying what kind of value an `ASTNode` holds."""

    BOOLEAN = "BOOLEAN"
    STRING = "STRING"
    NULL = "NULL"
    OBJECT = "OBJECT"
    # NOTE(review): there is no member for numbers or arrays. In this file the
    # parser tags number nodes as STRING and returns arrays as plain lists.
 
 
@dataclass
class Token:
    """A single lexical token produced by `tokenize`."""

    # Which kind of token this is.
    token_type: TokenType
    # The token's source text; for string tokens, the text between the quotes.
    value: str
 
 
@dataclass
class ASTNode:
    """A node of the parsed JSON tree.

    Attributes:
        node_type: The kind of JSON value this node represents.
        value: The payload, already converted to a Python object. Its
            concrete type depends on ``node_type``, hence ``Any``.
    """

    node_type: ASTNodeType
    # ``typing.Any`` rather than the builtin function ``any``, which is not a
    # type and is rejected by type checkers.
    value: Any
 
 
def tokenize(raw_string: str) -> list[Token]:
    """Split raw JSON text into a flat list of tokens.

    Recognised input:
      * whitespace (space, tab, newline, carriage return), which is skipped
      * the punctuation ``{ } [ ] : ,``
      * double-quoted strings; the token value is the text between the
        quotes, taken verbatim (backslash escapes are NOT interpreted, so a
        string cannot contain a double quote)
      * the literals ``true``, ``false`` and ``null``
      * integers made of ASCII digits, with an optional leading ``-``

    Args:
        raw_string: The JSON document to scan.

    Returns:
        The tokens in source order.

    Raises:
        Exception: If a string is never closed, or if a character or word is
            not part of the subset described above.
    """
    # Single-character punctuation maps straight onto a token type.
    punctuation = {
        "{": TokenType.BRACE_OPEN,
        "}": TokenType.BRACE_CLOSE,
        "[": TokenType.BRACKET_OPEN,
        "]": TokenType.BRACKET_CLOSE,
        ":": TokenType.COLON,
        ",": TokenType.COMMA,
    }
    keywords = {
        "true": TokenType.TRUE,
        "false": TokenType.FALSE,
        "null": TokenType.NULL,
    }

    tokens = []
    length = len(raw_string)
    current_pos = 0
    while current_pos < length:
        char_at_pos = raw_string[current_pos]

        if char_at_pos in " \n\t\r":
            current_pos += 1

        elif char_at_pos in punctuation:
            tokens.append(Token(punctuation[char_at_pos], value=char_at_pos))
            current_pos += 1

        elif char_at_pos == '"':
            # Jump to the closing quote instead of walking char by char; this
            # also detects a string that runs off the end of the input.
            closing = raw_string.find('"', current_pos + 1)
            if closing == -1:
                raise Exception("parsing failed: unterminated string")
            tokens.append(
                Token(TokenType.STRING, raw_string[current_pos + 1:closing])
            )
            current_pos = closing + 1

        elif char_at_pos.isalnum() or char_at_pos == "-":
            # Collect one word (literal or number). The bounds check lets a
            # word be the very last thing in the input.
            end = current_pos + 1
            while end < length and raw_string[end].isalnum():
                end += 1
            word = raw_string[current_pos:end]
            current_pos = end

            digits = word[1:] if word.startswith("-") else word
            if word in keywords:
                tokens.append(Token(token_type=keywords[word], value=word))
            elif digits.isascii() and digits.isdigit():
                # ASCII digits only, so the value is always valid for int().
                tokens.append(Token(token_type=TokenType.NUMBER, value=word))
            else:
                raise Exception("parsing failed")

        else:
            raise Exception("parsing failed")
    return tokens
 
 
class Parser:
    """Recursive-descent parser that turns a token list into a value tree.

    Cursor convention: every ``parse_*`` method is entered with ``current``
    on the first token of the value and returns with ``current`` on the
    value's *last* token; the caller steps past it.

    Leniency kept on purpose: a trailing comma before ``]`` or ``}`` is
    accepted.

    NOTE(review): number nodes are tagged ``ASTNodeType.STRING`` and arrays
    are returned as plain lists rather than ``ASTNode`` objects. Both are
    preserved here so existing callers see the same result shape.
    """

    # Class-level default kept for backward compatibility; ``__init__`` gives
    # every instance its own cursor.
    current = 0

    def __init__(self, tokens: list[Token]):
        self.tokens = tokens
        self.current = 0

    def advance(self) -> Token:
        """Move the cursor one token forward and return the token under it.

        Raises:
            Exception: If there is no further token (truncated input).
        """
        self.current += 1
        if self.current >= len(self.tokens):
            raise Exception("unexpected end of input")
        return self.tokens[self.current]

    def parse_arr(self) -> list:
        """Parse an array whose ``[`` token is under the cursor.

        Returns:
            A plain list holding the parsed elements in order.

        Raises:
            Exception: If the input ends early or two elements are not
                separated by a comma.
        """
        arr = []
        token = self.advance()  # step past the opening bracket
        while token.token_type != TokenType.BRACKET_CLOSE:
            arr.append(self.parse_value())

            token = self.advance()  # step past the element's last token
            if token.token_type == TokenType.COMMA:
                token = self.advance()  # step past the separator
            elif token.token_type != TokenType.BRACKET_CLOSE:
                raise Exception("expected ',' or ']' after array element")
        return arr

    def parse_object(self) -> ASTNode:
        """Parse an object whose ``{`` token is under the cursor.

        Returns:
            An ``OBJECT`` node whose value is a dict of key -> parsed value.

        Raises:
            Exception: If a key is not a string, a colon is missing, two
                members are not separated by a comma, or the input ends
                early.
        """
        node = ASTNode(node_type=ASTNodeType.OBJECT, value={})
        token = self.advance()  # step past the opening brace

        while token.token_type != TokenType.BRACE_CLOSE:
            if token.token_type != TokenType.STRING:
                raise Exception("expected a string in key place")
            key = token.value

            token = self.advance()
            if token.token_type != TokenType.COLON:
                raise Exception("colon expected")

            self.advance()  # step past the colon, onto the value
            node.value[key] = self.parse_value()

            token = self.advance()  # step past the value's last token
            if token.token_type == TokenType.COMMA:
                token = self.advance()  # step past the separator
            elif token.token_type != TokenType.BRACE_CLOSE:
                raise Exception("expected ',' or '}' after object member")

        return node

    def parse_value(self):
        """Parse the value that starts at the token under the cursor.

        Returns:
            An ``ASTNode`` for scalars and objects, or a plain list for
            arrays.

        Raises:
            Exception: If there is no token to read or the token cannot
                start a value.
        """
        if self.current >= len(self.tokens):
            raise Exception("unexpected end of input")
        token = self.tokens[self.current]

        if token.token_type == TokenType.STRING:
            return ASTNode(node_type=ASTNodeType.STRING, value=token.value)
        elif token.token_type == TokenType.NULL:
            return ASTNode(node_type=ASTNodeType.NULL, value=None)
        elif token.token_type == TokenType.TRUE:
            return ASTNode(node_type=ASTNodeType.BOOLEAN, value=True)
        elif token.token_type == TokenType.FALSE:
            return ASTNode(node_type=ASTNodeType.BOOLEAN, value=False)
        elif token.token_type == TokenType.NUMBER:
            # NOTE(review): tagged STRING although the value is an int;
            # preserved so callers see the same node type as before.
            return ASTNode(node_type=ASTNodeType.STRING, value=int(token.value))
        elif token.token_type == TokenType.BRACE_OPEN:
            return self.parse_object()
        elif token.token_type == TokenType.BRACKET_OPEN:
            return self.parse_arr()
        else:
            raise Exception("unknown type")

    def parse(self):
        """Parse the whole token list as a single JSON document.

        Returns:
            The parsed top-level value (see ``parse_value``).

        Raises:
            Exception: If the tokens are empty, malformed, or continue after
                the top-level value has ended.
        """
        value = self.parse_value()
        # parse_value leaves the cursor on the value's last token, so anything
        # beyond it is trailing garbage.
        if self.current != len(self.tokens) - 1:
            raise Exception("unexpected token after the end of the document")
        return value
 
 
if __name__ == "__main__":
    # Demo: tokenize a small document, parse it, and print the result.
    # The array under "anArray" carries a trailing comma, which this parser
    # accepts.
    sample_document = """{
  "id": "6",
  "index": 0,
  "anArray": [{"arrayPos": 0},],
  "boolean": true,
  "nullValue": null
}"""
    demo_tokens = tokenize(sample_document)
    demo_tree = Parser(tokens=demo_tokens).parse()
    print(demo_tree)