diff --git a/CMakeLists.txt b/CMakeLists.txt index 62f71fc..ff814a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,9 +14,27 @@ project( find_package(Python REQUIRED COMPONENTS Interpreter Development.Module) find_package(pybind11 CONFIG REQUIRED) -# Add a library using FindPython's tooling (pybind11 also provides a helper like -# this) -python_add_library(_core MODULE cpp_src/jsonparser.cpp WITH_SOABI) +# Define source files for the streaming JSON parser +set(PARSER_SOURCES + cpp_src/jsonparser.cpp +) + +# Define header files (for IDE integration) +set(PARSER_HEADERS + cpp_src/jsonparser.h +) + +# Add a library using FindPython's tooling +python_add_library(_core MODULE + cpp_src/bindings.cpp + ${PARSER_SOURCES} + WITH_SOABI +) + +# Make sure the include directory is in the include path +target_include_directories(_core PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/cpp_src) + +# Link against pybind11 target_link_libraries(_core PRIVATE pybind11::headers) # This is passing in the version as a define just as an example diff --git a/cpp_src/bindings.cpp b/cpp_src/bindings.cpp new file mode 100644 index 0000000..b0cb9e1 --- /dev/null +++ b/cpp_src/bindings.cpp @@ -0,0 +1,33 @@ + +/** + * Python bindings for the streaming JSON parser + */ + +#include "jsonparser.h" + +#include +#include +#include + + +namespace py = pybind11; + +// macro defined in pybind11.h (common.h) +PYBIND11_MODULE(_core, m) { + m.doc() = "C++ streaming JSON parser with Python bindings"; + + // Expose the StreamingJsonParser class along with its + // constructor and two functions. + // Note that we use getPython for get to return a py::object + py::class_(m, "StreamingJsonParser") + .def(py::init(), py::arg("strict_mode") = false) + .def("consume", &StreamingJsonParser::consume) + .def("get", &StreamingJsonParser::getPython); + + // Expose extra function for parsing without explicitly creating obj. 
+ m.def("parse_json", [](const std::string& json_str, bool strict_mode = false) { + StreamingJsonParser parser(strict_mode); + parser.consume(json_str); + return parser.getPython(); + }, py::arg("json_str"), py::arg("strict_mode") = false); +} diff --git a/cpp_src/jsonparser.cpp b/cpp_src/jsonparser.cpp index b8a03aa..e049682 100644 --- a/cpp_src/jsonparser.cpp +++ b/cpp_src/jsonparser.cpp @@ -3,6 +3,8 @@ * With Python bindings using pybind11. */ +#include "jsonparser.h" + #include #include #include @@ -10,255 +12,121 @@ #include #include #include -#include + #include namespace py = pybind11; -class JsonValue; -class JsonString; -class JsonObject; - -/** -Our JSON values are defined by classes JsonString and -JsonObject where both are derived from JsonValue. - -JSON keys are always strings so we use std::string for those. -It's also easy to convert to python types using py::str. -*/ -class JsonValue { -public: - virtual ~JsonValue() = default; - virtual bool isString() const = 0; - virtual bool isObject() const = 0; - virtual py::object toPython() const = 0; -}; -/** -* String value in JSON -*/ -class JsonString : public JsonValue { -public: - JsonString(const std::string& value = "") : value(value) {} - - bool isString() const override { return true; } - bool isObject() const override { return false; } - - void append(char c) { value += c; } - - py::object toPython() const override { - return py::str(value); - } - -private: - std::string value; -}; +StreamingJsonParser::StreamingJsonParser(bool strict_mode) + : state(START), strict_mode(strict_mode), result(std::make_unique()) { + // Initialize the expected characters for each state + expected_chars[START] = "{"; + expected_chars[EXPECT_KEY_OR_END] = "\"}"; + expected_chars[EXPECT_COLON] = ":"; + expected_chars[EXPECT_VALUE] = "\"{"; + expected_chars[EXPECT_COMMA_OR_END] = ",}"; +} -/** -* Object value in JSON (collection of key-value pairs) -*/ -class JsonObject : public JsonValue { -public: - bool isString() 
const override { return false; } - bool isObject() const override { return true; } - - // Add a key-value pair - void set(const std::string& key, std::unique_ptr value) { - members[key] = std::move(value); - } - - // Check if a key exists - bool has(const std::string& key) const { - return members.find(key) != members.end(); - } - - // Get a value by key - JsonValue* get(const std::string& key) { - auto it = members.find(key); - return it != members.end() ? it->second.get() : nullptr; - } - - // Construct a python dictionary with py::objects as keys and values. - py::object toPython() const override { - py::dict result; - for (const auto& [key, value] : members) { - result[py::str(key)] = value->toPython(); +void StreamingJsonParser::consume(const std::string& buffer) { + for (const char c : buffer) { + if (isWhitespace(c) && state != IN_KEY && state != IN_VALUE) { + continue; + } + + if (strict_mode) { + auto it = expected_chars.find(state); + if (it != expected_chars.end() && it->second.find(c) == std::string::npos) { + throw std::runtime_error( + "Got " + std::string(1, c) + " but expected one of " + it->second + ); + } } - return result; + + processChar(c); } - -private: - std::unordered_map> members; -}; +} -/** -* Stack-based state machine parser for streaming JSON. 
-*/ -class StreamingJsonParser { -public: - // Possible states in the parsing state machine - enum State { - START = 0, // expect { - EXPECT_KEY_OR_END = 1, // expect " or } - IN_KEY = 2, // expect char or end quote " - IN_VALUE = 3, // expect char or end quote " - EXPECT_COLON = 4, // expect : - EXPECT_VALUE = 5, // expect start quote " or { - EXPECT_COMMA_OR_END = 6 // expect , or } - }; - - StreamingJsonParser(bool strict_mode = false) - : state(START), strict_mode(strict_mode), result(std::make_unique()) { - // Initialize the expected characters for each state - expected_chars[START] = "{"; - expected_chars[EXPECT_KEY_OR_END] = "\"}"; - expected_chars[EXPECT_COLON] = ":"; - expected_chars[EXPECT_VALUE] = "\"{"; - expected_chars[EXPECT_COMMA_OR_END] = ",}"; - } - - ~StreamingJsonParser() = default; - - void consume(const std::string& buffer) { - for (const char c : buffer) { - if (isWhitespace(c) && state != IN_KEY && state != IN_VALUE) { - continue; +bool StreamingJsonParser::isWhitespace(char c) const { + return c == ' ' || c == '\n' || c == '\t' || c == '\r'; +} + +void StreamingJsonParser::processChar(char c) { + JsonObject* current_obj = stack.empty() ? 
result.get() : stack.back(); + + switch (state) { + case START: + if (c == '{') { + state = EXPECT_KEY_OR_END; } + break; - if (strict_mode) { - auto it = expected_chars.find(state); - if (it != expected_chars.end() && it->second.find(c) == std::string::npos) { - throw std::runtime_error( - "Got " + std::string(1, c) + " but expected one of " + it->second - ); + case EXPECT_KEY_OR_END: + if (c == '"') { + state = IN_KEY; + current_key = ""; + } else if (c == '}') { + if (!stack.empty()) { + stack.pop_back(); } + state = EXPECT_COMMA_OR_END; } + break; - processChar(c); - } - } - - // Don't confuse StreamingJsonParser::get and std::unqiue_ptr::get - JsonObject* get() const { - return result.get(); - } - - // Get the result as a Python dict - py::object getPython() const { - return result->toPython(); - } - -private: - std::unique_ptr result; - // Holds a stack of pointers to JsonObjects, make sure that pointers pushed - // here have lifetimes that exceed the time on stack. - std::vector stack; - State state; - std::string current_key; - bool strict_mode; - std::unordered_map expected_chars; - - bool isWhitespace(char c) const { - return c == ' ' || c == '\n' || c == '\t' || c == '\r'; - } - - void processChar(char c) { - JsonObject* current_obj = stack.empty() ? 
result.get() : stack.back(); - - switch (state) { - case START: - if (c == '{') { - state = EXPECT_KEY_OR_END; - } - break; - - case EXPECT_KEY_OR_END: - if (c == '"') { - state = IN_KEY; - current_key = ""; - } else if (c == '}') { - if (!stack.empty()) { - stack.pop_back(); - } - state = EXPECT_COMMA_OR_END; - } - break; - - case IN_KEY: - if (c == '"') { - state = EXPECT_COLON; - } else { - current_key += c; - } - break; - - case IN_VALUE: - if (c == '"') { - state = EXPECT_COMMA_OR_END; - } else { - auto* value = current_obj->get(current_key); - assert(value != nullptr && value->isString() && "current_obj not inited in IN_VALUE"); - // since value is a JsonValue*, must cast to JsonString* - dynamic_cast(value)->append(c); - } - break; - - case EXPECT_COLON: - if (c == ':') { - state = EXPECT_VALUE; - } - break; - - case EXPECT_VALUE: - if (c == '"') { - // we know it's a string value so set cur_obj[cur_key] = "" - state = IN_VALUE; - current_obj->set(current_key, std::make_unique()); - } else if (c == '{') { - auto newObj = std::make_unique(); - JsonObject* objPtr = newObj.get(); - // Transfer ownership of pointer to current_obj - current_obj->set(current_key, std::move(newObj)); - // Store raw pointer (w/o ownership) on stack. current_obj has ownership - // so we have to make sure that current_obj lives longer than the element - // does on the stack: - // This is satisifed because 1) current_obj points to result which is alive - // the longest, or 2), points to the previous stack entry which by definition - // of a stack will outlive it. 
- stack.push_back(objPtr); - state = EXPECT_KEY_OR_END; - } - break; - - case EXPECT_COMMA_OR_END: - if (c == ',') { - state = EXPECT_KEY_OR_END; - } else if (c == '}') { - if (!stack.empty()) { - stack.pop_back(); - } - state = EXPECT_COMMA_OR_END; + case IN_KEY: + if (c == '"') { + state = EXPECT_COLON; + } else { + current_key += c; + } + break; + + case IN_VALUE: + if (c == '"') { + state = EXPECT_COMMA_OR_END; + } else { + auto* value = current_obj->get(current_key); + assert(value != nullptr && value->isString() && "current_obj not inited in IN_VALUE"); + // since value is a JsonValue*, must cast to JsonString* + dynamic_cast(value)->append(c); + } + break; + + case EXPECT_COLON: + if (c == ':') { + state = EXPECT_VALUE; + } + break; + + case EXPECT_VALUE: + if (c == '"') { + // we know it's a string value so set cur_obj[cur_key] = "" + state = IN_VALUE; + current_obj->set(current_key, std::make_unique()); + } else if (c == '{') { + auto newObj = std::make_unique(); + JsonObject* objPtr = newObj.get(); + // Transfer ownership of pointer to current_obj + current_obj->set(current_key, std::move(newObj)); + // Store raw pointer (w/o ownership) on stack. current_obj has ownership + // so we have to make sure that current_obj lives longer than the element + // does on the stack: + // This is satisfied because 1) current_obj points to result which is alive + // the longest, or 2), points to the previous stack entry which by definition + // of a stack will outlive it. + stack.push_back(objPtr); + state = EXPECT_KEY_OR_END; + } + break; + + case EXPECT_COMMA_OR_END: + if (c == ',') { + state = EXPECT_KEY_OR_END; + } else if (c == '}') { + if (!stack.empty()) { + stack.pop_back(); } - break; - } + state = EXPECT_COMMA_OR_END; + } + break; } -}; - -// macro defined in pybind11.h (common.h) -PYBIND11_MODULE(_core, m) { - m.doc() = "C++ streaming JSON parser with Python bindings"; - - // Expose the StreamJsonParser class along with its - // constructor and two functions. 
- // Note that we use getPython for get to return a py::object - py::class_(m, "StreamingJsonParser") - .def(py::init(), py::arg("strict_mode") = false) - .def("consume", &StreamingJsonParser::consume) - .def("get", &StreamingJsonParser::getPython); - - // Expose extra function for parsing without explictly creating obj. - m.def("parse_json", [](const std::string& json_str, bool strict_mode = false) { - StreamingJsonParser parser(strict_mode); - parser.consume(json_str); - return parser.getPython(); - }, py::arg("json_str"), py::arg("strict_mode") = false); } diff --git a/cpp_src/jsonparser.h b/cpp_src/jsonparser.h new file mode 100644 index 0000000..acf66e5 --- /dev/null +++ b/cpp_src/jsonparser.h @@ -0,0 +1,136 @@ + +/** +* StreamingJsonParser: A C++ implementation of a streaming JSON parser. +* With Python bindings using pybind11. +*/ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; + +class JsonValue; +class JsonString; +class JsonObject; + +/** +Our JSON values are defined by classes JsonString and +JsonObject where both are derived from JsonValue. + +JSON keys are always strings so we use std::string for those. +It's also easy to convert to python types using py::str. 
+*/ +class JsonValue { +public: + virtual ~JsonValue() = default; + virtual bool isString() const = 0; + virtual bool isObject() const = 0; + virtual py::object toPython() const = 0; +}; + +/** +* String value in JSON +*/ +class JsonString : public JsonValue { +public: + JsonString(const std::string& value = "") : value(value) {} + + bool isString() const override { return true; } + bool isObject() const override { return false; } + + void append(char c) { value += c; } + + py::object toPython() const override { + return py::str(value); + } + +private: + std::string value; +}; + +/** +* Object value in JSON (collection of key-value pairs) +*/ +class JsonObject : public JsonValue { +public: + bool isString() const override { return false; } + bool isObject() const override { return true; } + + // Add a key-value pair + void set(const std::string& key, std::unique_ptr value) { + members[key] = std::move(value); + } + + // Check if a key exists + bool has(const std::string& key) const { + return members.find(key) != members.end(); + } + + // Get a value by key + JsonValue* get(const std::string& key) { + auto it = members.find(key); + return it != members.end() ? it->second.get() : nullptr; + } + + // Construct a python dictionary with py::objects as keys and values. + py::object toPython() const override { + py::dict result; + for (const auto& [key, value] : members) { + result[py::str(key)] = value->toPython(); + } + return result; + } + +private: + std::unordered_map> members; +}; + +/** +* Stack-based state machine parser for streaming JSON. 
+*/ +class StreamingJsonParser { +public: + // Possible states in the parsing state machine + enum State { + START = 0, // expect { + EXPECT_KEY_OR_END = 1, // expect " or } + IN_KEY = 2, // expect char or end quote " + IN_VALUE = 3, // expect char or end quote " + EXPECT_COLON = 4, // expect : + EXPECT_VALUE = 5, // expect start quote " or { + EXPECT_COMMA_OR_END = 6 // expect , or } + }; + + StreamingJsonParser(bool strict_mode = false); + ~StreamingJsonParser() = default; + + void consume(const std::string& buffer); + + // Don't confuse StreamingJsonParser::get and std::unique_ptr::get + JsonObject* get() const { + return result.get(); + } + + // Get the result as a Python dict + py::object getPython() const { + return result->toPython(); + } + +private: + std::unique_ptr result; + // Holds a stack of pointers to JsonObjects, make sure that pointers pushed + // here have lifetimes that exceed the time on stack. + std::vector stack; + State state; + std::string current_key; + bool strict_mode; + std::unordered_map expected_chars; + + bool isWhitespace(char c) const; + void processChar(char c); +};