A boost regex library approach to parsing JSON-format SPARQL results.
(Quick and dirty syntax colorizing with emacs htmlize-buffer.)
/** JSONparser.cpp - boost::regex parser for the SPARQL JSON results format. * input: strings defined by <http://www.w3.org/TR/rdf-sparql-json-res/>. * requires: * boost::regex - regular expression library. * boost::optional - optionally initialized container, like Haskel Maybe. * author: ericP, Eric Prud'hommeaux . * license: Apache License, Version 2.0, free like speech. */ #include <iostream> #include <fstream> #include <sstream> #include <algorithm> #include <boost/regex.hpp> #include <boost/optional.hpp> /** ToUTF8 - convert ordinal references to UTF-8 sequences. * Out - an input_stream or something with a similar incremental operator=. */ struct ToUTF8 { struct InvalidUCScode : public std::runtime_error { unsigned int code; InvalidUCScode (unsigned int code) : runtime_error(codeString(code)), code(code) {} static std::string codeString (unsigned int code) { // use boost::lexical_cast here if you're already including it. std::stringstream ss; ss << "invalid UCS code: " << std::hex << code; return std::string(ss.str()); } }; template<typename Out> Out operator() (const char* s, Out out) const { return operator()(strtol(s, (char**) NULL, 16), out); } template<typename Out> Out operator() (unsigned int ord, Out out) const throw(InvalidUCScode) { // http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html if (ord < 0x80) { out = ord >> 0 & 0x7F | 0x00; } else if (ord < 0x0800) { out = ord >> 6 & 0x1F | 0xC0; out = ord >> 0 & 0x3F | 0x80; } else if (ord < 0x010000) { out = ord >> 12 & 0x0F | 0xE0; out = ord >> 6 & 0x3F | 0x80; out = ord >> 0 & 0x3F | 0x80; } else if (ord < 0x110000) { out = ord >> 18 & 0x07 | 0xF0; out = ord >> 12 & 0x3F | 0x80; out = ord >> 6 & 0x3F | 0x80; out = ord >> 0 & 0x3F | 0x80; } else { throw InvalidUCScode(ord); } return out; } }; /** JSONstringToUTF8 - convert JSON's escaped string sequences to UTF-8. 
*/ struct JSONstringToUTF8 { struct Replacer : public ToUTF8 { char operator() (char ch) const { switch (ch) { case 'b': return '\b'; case 'f': return '\f'; case 'n': return '\n'; case 'r': return '\r'; case 't': return '\t'; #if STRIDENT // Police undefined escape sequences. case '\\': return '\\'; case '"': return '"'; case '/': return '/'; default: throw std::runtime_error(std::string() + "unknown escape charater \"" + ch + "\""); #else // Pass undefined escape sequences as the escaped character. default: return ch; #endif } } template<typename Out> Out operator() (char ch, Out out) const { out = operator()(ch); return out; } /** Functor invoked by boost::regex_replace for each matched pattern. */ template<typename Out> // boost::re_detail::string_out_iterator<std::string> Out operator() (boost::smatch const &what, Out out) const { return what[2].first == what[2].second // First group captures single character escapes: "\/bfnrt ? operator()(*(what[1].first), out) // Second group captures codepoints: 0022 : ToUTF8::operator()(std::string(what[2].first, what[2].second).c_str(), out); } }; /** Unescape all JSON escape sequences in s. */ std::string operator() (std::string s) { // Look for JSON escape sequences. boost::regex e("\\\\(?:([\"\\\\/bfnrt])|([0-9a-fA-F]{4}))"); // Call Replacer () on each match. return regex_replace(s, e, Replacer(), boost::match_default); } /** Unescape all JSON escape sequences in range from, to. */ std::string operator() (std::string::const_iterator from, std::string::const_iterator to) { return operator()(std::string(from, to)); } }; struct JSONparser { // Iterate through the input stream. std::string::const_iterator from, to; // boost:::regex structure of what was just matched. boost::match_results<std::string::const_iterator> what; boost::match_flag_type flags; JSONparser () : flags(boost::match_perl|boost::match_single_line) { } /* * Convenience functions. 
*/ std::string stringToCurlyNotation (std::string s) { std::string ret; for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) switch (*it) { case '\b': ret += "\\b"; break; case '\f': ret += "\\f"; break; case '\n': ret += "\\n"; break; case '\r': ret += "\\r"; break; case '\t': ret += "\\t"; break; case '\\': ret += "\\\\"; break; case '"': ret += "\\\""; break; default: ret += *it; } return ret; } void expect (boost::regex e) { if (!regex_search(from, to, what, e, flags)) { std::string sample(what[0].first, what[0].first + 20); throw std::runtime_error(std::string() + "\"" + sample + "\" didn't match \"" + e.str() + "\""); } from = what[0].second; } bool askFor (boost::regex e) { if (!regex_search(from, to, what, e, flags)) return false; from = what[0].second; return true; } /* Parse an istream and cout out {?var→term, …} patterns. */ void operator() (std::istream& in) { const std::string s((std::istreambuf_iterator<char>(in)), std::istreambuf_iterator<char>()); from = s.begin(); to = s.end(); boost::optional<std::string> link; /* Parse the head. */ expect(boost::regex("\\A[ \\n]*\\{[ \\n]*")); expect(boost::regex("\\A\"head\" *: *\\{[ \\n]*")); if (askFor(boost::regex("\\A\"link\"[ \\n]*:[ \\n]*\\[[ \\n]*\"([^\"]+)\"[ \\n]*\\][ \\n]*,[ \\n]*"))) link = std::string(what[1].first, what[1].second); expect(boost::regex("\\A\"vars\" *: *\\[[ \\n]*(\"([^\"]+)\")(?:[ \\n]*,[ \\n]*\"([^\"]+)\")*[ \\n]*\\][ \\n]*")); // Are links allowed before AND after "vars"? if (!link.is_initialized() && askFor(boost::regex("\\A\"link\"[ \\n]*:[ \\n]*\\[[ \\n]*\"([^\"]+)\"[ \\n]*\\][ \\n]*,[ \\n]*"))) link = std::string(what[1].first, what[1].second); if (link.is_initialized()) std::cout << "link: " << *link << "\n"; /* Parser the bindings. 
*/ expect(boost::regex("\\A\\}[ \\n]*,[ \\n]*\"results\" *: *\\{[ \\n]*\"bindings\" *: *\\[[ \\n]*")); { const boost::regex close("\\A[ \\n]*\\}[ \\n]*(?:,[ \\n]*)?"); // For each solution, const boost::regex open("\\A[ \\n]*\\{[ \\n]*"); while (regex_search(from, to, what, open, flags)) { from = what[0].second; std::cout << "{"; int varNo = 0; // for printing ','s // For each binding, const boost::regex var_r("\\A\"([^\"]+)\"[ \\n]*:[ \\n]*\\{[ \\n]*"); while (regex_search(from, to, what, var_r, flags)) { from = what[0].second; JSONstringToUTF8 unescape; // the variable being bound: std::string var(unescape(what[1].first, what[1].second)); #ifdef ORDERED while (askFor(boost::regex("\\A\"type\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?"))) { std::string type(unescape(what[1].first, what[1].second)); // catches e.g. "ur\0069" expect(boost::regex("\\A\"value\"[ \\n]*:[ \\n]*\"((?:[^\\\\\"]|\\\\[\"nrtb])*)\"[ \\n]*(,[ \\n]*)?")); // taking stringToCurlyNotation(unescape(X)) to be X due to similar encoding rules std::string value(what[1].first, what[1].second); if (varNo++ > 0) std::cout << ", "; std::cout << "?" 
<< var + "→"; if (type == "uri") std::cout << "<" + value + ">"; else if (type == "bnode") std::cout << "_:" + value; else if (type == "literal") { std::cout << "\"" + value + "\""; if (askFor(boost::regex("\\A\"xml:lang\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?"))) std::cout << "@" << std::string(what[1].first, what[1].second); } else if (type == "typed-literal") { std::cout << "\"" + value + "\""; expect(boost::regex("\\A\"datatype\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?")); std::cout << "^^<" << std::string(what[1].first, what[1].second) << ">"; } else { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "unknown SPARQL JSON type \"" + type + "\" in \"" + sample + "\"."); } } expect(close); #else /* !ORDERED */ // what we'll learn about that binding: enum {type_UNSET, type_URI, type_BNODE, type_LITERAL, type_TYPEDLITERAL} type = type_UNSET; boost::optional<std::string> value, datatype, lang; // Until the solution is closed with a close curly, while (!regex_search(from, to, what, close, flags)) { // Look for the four binding specifiers. const boost::regex type_r("\\A\"type\"[ \\n]*:[ \\n]*\"(uri|bnode|literal|typed-literal)\"[ \\n]*(,[ \\n]*)?"); const boost::regex value_r("\\A\"value\"[ \\n]*:[ \\n]*\"((?:[^\\\\\"]|\\\\[\"nrtb])*)\"[ \\n]*(,[ \\n]*)?"); const boost::regex dtype_r("\\A\"datatype\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?"); const boost::regex lang_r("\\A\"xml:lang\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?"); /* Match and validate each directive. */ if (regex_search(from, to, what, type_r, flags)) { if (type != type_UNSET) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "\"" + sample + "\" is a duplicate type directive."); } std::string typeStr(unescape(what[1].first, what[1].second)); // Assign type enum per parsed type directive. 
if (typeStr == "uri") type = type_URI; else if (typeStr == "bnode") type = type_BNODE; else if (typeStr == "literal") type = type_LITERAL; else if (typeStr == "typed-literal") type = type_TYPEDLITERAL; else { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "unknown SPARQL JSON type \"" + typeStr + "\" in \"" + sample + "\"."); } // Check for compatibility with already parsed directives if (datatype.is_initialized() && type != type_TYPEDLITERAL) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "datatype directive only permitted for typed-literal at \"" + sample + "\"."); } if (lang.is_initialized() && type != type_LITERAL) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "xml:lang directive only permitted for literal at \"" + sample + "\"."); } } else if (regex_search(from, to, what, value_r, flags)) { if (value.is_initialized()) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "\"" + sample + "\" is a duplicate value directive."); } value = unescape(what[1].first, what[1].second); } else if (regex_search(from, to, what, dtype_r, flags)) { if (datatype.is_initialized()) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "\"" + sample + "\" is a duplicate dtype directive."); } if (type != type_UNSET && type != type_TYPEDLITERAL) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "datatype directive only permitted for typed-literal at \"" + sample + "\"."); } datatype = unescape(what[1].first, what[1].second); } else if (regex_search(from, to, what, lang_r, flags)) { if (lang.is_initialized()) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "\"" + sample + "\" is a duplicate lang directive."); } if (type != type_UNSET && type != 
type_LITERAL) { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "xml:lang directive only permitted for literal at \"" + sample + "\"."); } lang = unescape(what[1].first, what[1].second); } else { std::string sample(what[0].first, what[0].first + 80); throw std::runtime_error(std::string() + "\"" + sample + "\" isn't a type, value, datatype or xml:lang directive."); } from = what[0].second; } from = what[0].second; // get past the close pattern if (!value.is_initialized()) throw std::runtime_error(std::string() + "no value set for variable \"" + var + "\"."); if (varNo++ > 0) std::cout << ", "; std::cout << "?" << var + "→"; switch (type) { case type_UNSET: throw std::runtime_error(std::string() + "no node type set for variable \"" + var + "\"."); case type_URI: std::cout << "<" + stringToCurlyNotation(*value) + ">"; break; case type_BNODE: std::cout << "_:" + stringToCurlyNotation(*value); break; case type_LITERAL: std::cout << "\"" + stringToCurlyNotation(*value) + "\""; if (lang.is_initialized()) std::cout << "@" << *lang; break; case type_TYPEDLITERAL: if (!datatype.is_initialized()) throw std::runtime_error(std::string() + "no xml:type set for variable \"" + var + "\"."); std::cout << "\"" + stringToCurlyNotation(*value) + "\"^^<" + *datatype + ">"; break; } #endif /* !ORDERED */ } regex_search(from, to, what, close, flags); from = what[0].second; // get past the close pattern std::cout << "}\n"; } } /* Make sure we didn't get lost. */ expect(boost::regex("\\A\\][ \\n]*\\}[ \\n]*\\}[ \\n]*")); if (from != to) { std::string rest(from, to); throw std::runtime_error(std::string() + "garbage found at end of stream \"" + rest + "\""); } } }; int main (int argc, const char* argv[]) { try { // Test UTF8 handling. try { ToUTF8 toUTF8; std::stringstream ss; toUTF8(0x98df, std::ostreambuf_iterator<char>(ss)); // Shoku (food) assert (ss.str() == "食"); toUTF8(0x10ffff, std::ostreambuf_iterator<char>(ss)); // just fits. 
toUTF8(0x110000, std::ostreambuf_iterator<char>(ss)); // too large. assert (false); } catch (ToUTF8::InvalidUCScode& ex) { assert (ex.code == 0x110000); // expected exception. } catch (...) { assert (false); // unexpected exception. } // Test JSON escape parsing. JSONstringToUTF8 unescape; assert (unescape(std::string("hello \\\"world\\\"\\nab\\0022\\98dfcd")) == "hello \"world\"\nab\"食cd"); // Parse some JSON. std::ifstream ifs("asdf.json", std::ios::binary); if (!ifs.is_open()) throw std::runtime_error("couldn't open asdf.json"); JSONparser p; p(ifs); } catch (std::runtime_error const& e) { std::cerr << "std::runtime_error: " << e.what() << "\n"; } catch (std::string const& e) { std::cerr << "std::string: " << e << "\n"; } catch (...) { std::cerr << "˙buoɹʍ ʎɹǝʌ ʇuǝʍ buıɥʇǝɯos\n"; } return 0; }