regexParser.cpp

A boost regex library approach to parsing JSON-format SPARQL results.
(Quick and dirty syntax colorizing with emacs htmlize-buffer.)
/** JSONparser.cpp - boost::regex parser for the SPARQL JSON results format.
 * input: strings defined by <http://www.w3.org/TR/rdf-sparql-json-res/>.
 * requires:
 *   boost::regex - regular expression library.
 *   boost::optional - optionally initialized container, like Haskell's Maybe.
 * author: ericP, Eric Prud'hommeaux .
 * license: Apache License, Version 2.0, free like speech.
 */

#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <boost/optional.hpp>
#include <boost/regex.hpp>

/** ToUTF8 - convert code points (ordinals) to UTF-8 byte sequences.
 * Out - any output iterator that accepts a char through `*out++ = c`, e.g.
 *   std::ostreambuf_iterator<char>, std::back_insert_iterator<std::string>
 *   or a plain char*.
 * Both call operators return the iterator advanced past the bytes written.
 */
struct ToUTF8
{
    /** Thrown for ordinals of 0x110000 and above.
     * `code` carries the offending ordinal; what() is
     * "invalid UCS code: " followed by the ordinal in hex.
     */
    struct InvalidUCScode : public std::runtime_error {
        unsigned int code;
        InvalidUCScode (unsigned int code) : std::runtime_error(codeString(code)), code(code) {}
        static std::string codeString (unsigned int code) {
            // use boost::lexical_cast here if you're already including it.
            std::ostringstream ss;
            ss << "invalid UCS code: " << std::hex << code;
            return ss.str();
        }
    };

    /** Encode the code point written as a hexadecimal string, e.g. "98df".
     * @param s    NUL-terminated hex digits, parsed with strtoul base 16.
     * @param out  where the UTF-8 bytes are written.
     * @throws InvalidUCScode if the parsed value is 0x110000 or greater.
     */
    template<typename Out>
    Out operator() (const char* s, Out out) const {
        const unsigned long parsed = std::strtoul(s, NULL, 16);
        // Saturate rather than truncate, so a value too wide for unsigned int
        // is rejected below instead of wrapping round to a valid code point.
        const unsigned int ord = parsed > 0xFFFFFFFFul
            ? 0xFFFFFFFFu
            : static_cast<unsigned int>(parsed);
        return operator()(ord, out);
    }

    /** Encode one code point as 1 to 4 UTF-8 bytes.
     * @param ord  the code point.
     * @param out  where the UTF-8 bytes are written.
     * @throws InvalidUCScode if ord is 0x110000 or greater.
     */
    template<typename Out>
    Out operator() (unsigned int ord, Out out) const {
        // http://www.herongyang.com/Unicode/UTF-8-UTF-8-Encoding-Algorithm.html
        if (ord < 0x80) {
            *out++ = static_cast<char>(ord & 0x7F);
        } else if (ord < 0x0800) {
            *out++ = static_cast<char>(((ord >> 6)  & 0x1F) | 0xC0);
            *out++ = static_cast<char>(( ord        & 0x3F) | 0x80);
        } else if (ord < 0x010000) {
            *out++ = static_cast<char>(((ord >> 12) & 0x0F) | 0xE0);
            *out++ = static_cast<char>(((ord >> 6)  & 0x3F) | 0x80);
            *out++ = static_cast<char>(( ord        & 0x3F) | 0x80);
        } else if (ord < 0x110000) {
            *out++ = static_cast<char>(((ord >> 18) & 0x07) | 0xF0);
            *out++ = static_cast<char>(((ord >> 12) & 0x3F) | 0x80);
            *out++ = static_cast<char>(((ord >> 6)  & 0x3F) | 0x80);
            *out++ = static_cast<char>(( ord        & 0x3F) | 0x80);
        } else {
            throw InvalidUCScode(ord);
        }
        return out;
    }
};

/** JSONstringToUTF8 - convert JSON's escaped string sequences to UTF-8.
 */
struct JSONstringToUTF8 {
    struct Replacer : public ToUTF8 {
        char operator() (char ch) const {
            switch (ch) {
            case  'b': return '\b';
            case  'f': return '\f';
            case  'n': return '\n';
            case  'r': return '\r';
            case  't': return '\t';
#if STRIDENT
            // Police undefined escape sequences.
            case '\\': return '\\';
            case  '"': return  '"';
            case  '/': return  '/';
            default: throw std::runtime_error(std::string()
                                              + "unknown escape charater \""
                                              + ch + "\"");
#else
            // Pass undefined escape sequences as the escaped character.
            default: return ch;
#endif
            }
        }

        template<typename Out>
        Out operator() (char ch, Out out) const {
            out = operator()(ch);
            return out;
        }

        /** Functor invoked by boost::regex_replace for each matched pattern.
         */
        template<typename Out> // boost::re_detail::string_out_iterator<std::string>
        Out operator() (boost::smatch const &what, Out out) const {
            return what[2].first == what[2].second
                // First group captures single character escapes: "\/bfnrt
                ? operator()(*(what[1].first), out)
                // Second group captures codepoints: 0022
                : ToUTF8::operator()(std::string(what[2].first, what[2].second).c_str(), out);
        }

    };
    /** Unescape all JSON escape sequences in s.
     */
    std::string operator() (std::string s) {
        // Look for JSON escape sequences.
        boost::regex e("\\\\(?:([\"\\\\/bfnrt])|([0-9a-fA-F]{4}))");
        // Call Replacer () on each match.
        return regex_replace(s, e, Replacer(), boost::match_default);
    }

    /** Unescape all JSON escape sequences in range from, to.
     */
    std::string operator() (std::string::const_iterator from,
                            std::string::const_iterator to) {
        return operator()(std::string(from, to));
    }

};

struct JSONparser {

    // Iterate through the input stream.
    std::string::const_iterator from, to;
    // boost:::regex structure of what was just matched.
    boost::match_results<std::string::const_iterator> what;
    boost::match_flag_type flags;

    JSONparser ()
        : flags(boost::match_perl|boost::match_single_line)
    {  }

    /*
     * Convenience functions.
     */
    std::string stringToCurlyNotation (std::string s) {
        std::string ret;
        for (std::string::const_iterator it = s.begin();
             it != s.end(); ++it)
            switch (*it) {
            case '\b': ret +=  "\\b"; break;
            case '\f': ret +=  "\\f"; break;
            case '\n': ret +=  "\\n"; break;
            case '\r': ret +=  "\\r"; break;
            case '\t': ret +=  "\\t"; break;
            case '\\': ret += "\\\\"; break;
            case  '"': ret += "\\\""; break;
            default:   ret += *it;
            }
        return ret;
    }
    void expect (boost::regex e) {
        if (!regex_search(from, to, what, e, flags)) {
            std::string sample(what[0].first, what[0].first + 20);
            throw std::runtime_error(std::string() + "\"" + sample
                                     + "\" didn't match \"" + e.str() + "\"");
        }
        from = what[0].second;
    }
    bool askFor (boost::regex e) {
        if (!regex_search(from, to, what, e, flags))
            return false;
        from = what[0].second;
        return true;
    }

    /* Parse an istream and cout out {?var→term, …} patterns.
     */
    void operator() (std::istream& in) {
        const std::string s((std::istreambuf_iterator<char>(in)),
                            std::istreambuf_iterator<char>());
        from = s.begin();
        to = s.end();
        boost::optional<std::string> link;

        /* Parse the head.
         */
        expect(boost::regex("\\A[ \\n]*\\{[ \\n]*"));
        expect(boost::regex("\\A\"head\" *: *\\{[ \\n]*"));
        if (askFor(boost::regex("\\A\"link\"[ \\n]*:[ \\n]*\\[[ \\n]*\"([^\"]+)\"[ \\n]*\\][ \\n]*,[ \\n]*")))
            link = std::string(what[1].first, what[1].second);
        expect(boost::regex("\\A\"vars\" *: *\\[[ \\n]*(\"([^\"]+)\")(?:[ \\n]*,[ \\n]*\"([^\"]+)\")*[ \\n]*\\][ \\n]*"));
        // Are links allowed before AND after "vars"?
        if (!link.is_initialized() &&
            askFor(boost::regex("\\A\"link\"[ \\n]*:[ \\n]*\\[[ \\n]*\"([^\"]+)\"[ \\n]*\\][ \\n]*,[ \\n]*")))
            link = std::string(what[1].first, what[1].second);

        if (link.is_initialized())
            std::cout << "link: " << *link << "\n";

        /* Parser the bindings.
         */
        expect(boost::regex("\\A\\}[ \\n]*,[ \\n]*\"results\" *: *\\{[ \\n]*\"bindings\" *: *\\[[ \\n]*"));
        {
            const boost::regex close("\\A[ \\n]*\\}[ \\n]*(?:,[ \\n]*)?");

            // For each solution,
            const boost::regex open("\\A[ \\n]*\\{[ \\n]*");
            while (regex_search(from, to, what, open, flags)) {
                from = what[0].second;
                std::cout << "{";
                int varNo = 0; // for printing ','s

                // For each binding,
                const boost::regex var_r("\\A\"([^\"]+)\"[ \\n]*:[ \\n]*\\{[ \\n]*");
                while (regex_search(from, to, what, var_r, flags)) {
                    from = what[0].second;
                    JSONstringToUTF8 unescape;

                    // the variable being bound:
                    std::string var(unescape(what[1].first, what[1].second));
#ifdef ORDERED
                    while (askFor(boost::regex("\\A\"type\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?"))) {
                        std::string type(unescape(what[1].first, what[1].second)); // catches e.g. "ur\0069"
                        expect(boost::regex("\\A\"value\"[ \\n]*:[ \\n]*\"((?:[^\\\\\"]|\\\\[\"nrtb])*)\"[ \\n]*(,[ \\n]*)?"));
                        // taking stringToCurlyNotation(unescape(X)) to be X due to similar encoding rules
                        std::string value(what[1].first, what[1].second);

                        if (varNo++ > 0)
                            std::cout << ", ";
                        std::cout << "?" << var + "→";

                        if (type == "uri")
                            std::cout << "<" + value + ">";
                        else if (type == "bnode")
                            std::cout << "_:" + value;
                        else if (type == "literal") {
                            std::cout << "\"" + value + "\"";
                            if (askFor(boost::regex("\\A\"xml:lang\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?")))
                                std::cout << "@" << std::string(what[1].first, what[1].second);
                        } else if (type == "typed-literal") {
                            std::cout << "\"" + value + "\"";
                            expect(boost::regex("\\A\"datatype\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?"));
                            std::cout << "^^<" << std::string(what[1].first, what[1].second) << ">";
                        } else {
                            std::string sample(what[0].first, what[0].first + 80);
                            throw std::runtime_error(std::string() + "unknown SPARQL JSON type \""
                                                     + type + "\" in \"" + sample + "\".");
                        }
                    }
                    expect(close);
#else /* !ORDERED */
                    // what we'll learn about that binding:
                    enum {type_UNSET, type_URI, type_BNODE, type_LITERAL, type_TYPEDLITERAL} type = type_UNSET;
                    boost::optional<std::string> value, datatype, lang;

                    // Until the solution is closed with a close curly,
                    while (!regex_search(from, to, what, close, flags)) {

                        // Look for the four binding specifiers.
                        const boost::regex type_r("\\A\"type\"[ \\n]*:[ \\n]*\"(uri|bnode|literal|typed-literal)\"[ \\n]*(,[ \\n]*)?");
                        const boost::regex value_r("\\A\"value\"[ \\n]*:[ \\n]*\"((?:[^\\\\\"]|\\\\[\"nrtb])*)\"[ \\n]*(,[ \\n]*)?");
                        const boost::regex dtype_r("\\A\"datatype\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?");
                        const boost::regex lang_r("\\A\"xml:lang\"[ \\n]*:[ \\n]*\"([^\"]+)\"[ \\n]*(,[ \\n]*)?");

                        /* Match and validate each directive.
                         */
                        if (regex_search(from, to, what, type_r, flags)) {
                            if (type != type_UNSET) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string() + "\"" + sample
                                                         + "\" is a duplicate type directive.");
                            }
                            std::string typeStr(unescape(what[1].first, what[1].second));

                            // Assign type enum per parsed type directive.
                            if (typeStr == "uri") type = type_URI;
                            else if (typeStr == "bnode") type = type_BNODE;
                            else if (typeStr == "literal") type = type_LITERAL;
                            else if (typeStr == "typed-literal") type = type_TYPEDLITERAL;
                            else {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string() + "unknown SPARQL JSON type \""
                                                         + typeStr + "\" in \"" + sample + "\".");
                            }

                            // Check for compatibility with already parsed directives
                            if (datatype.is_initialized() && type != type_TYPEDLITERAL) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string()
                                                         + "datatype directive only permitted for typed-literal at \""
                                                         + sample + "\".");
                            }
                            if (lang.is_initialized() && type != type_LITERAL) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string()
                                                         + "xml:lang directive only permitted for literal at \""
                                                         + sample + "\".");
                            }

                        } else if (regex_search(from, to, what, value_r, flags)) {
                            if (value.is_initialized()) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string() + "\"" + sample
                                                         + "\" is a duplicate value directive.");
                            }

                            value = unescape(what[1].first, what[1].second);
                        } else if (regex_search(from, to, what, dtype_r, flags)) {
                            if (datatype.is_initialized()) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string() + "\"" + sample
                                                         + "\" is a duplicate dtype directive.");
                            }
                            if (type != type_UNSET && type != type_TYPEDLITERAL) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string()
                                                         + "datatype directive only permitted for typed-literal at \""
                                                         + sample + "\".");
                            }

                            datatype = unescape(what[1].first, what[1].second);
                        } else if (regex_search(from, to, what, lang_r, flags)) {
                            if (lang.is_initialized()) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string() + "\"" + sample
                                                         + "\" is a duplicate lang directive.");
                            }
                            if (type != type_UNSET && type != type_LITERAL) {
                                std::string sample(what[0].first, what[0].first + 80);
                                throw std::runtime_error(std::string()
                                                         + "xml:lang directive only permitted for literal at \""
                                                         + sample + "\".");
                            }

                            lang = unescape(what[1].first, what[1].second);
                        } else {
                            std::string sample(what[0].first, what[0].first + 80);
                            throw std::runtime_error(std::string() + "\"" + sample
                                                     + "\" isn't a type, value, datatype or xml:lang directive.");
                        }
                        from = what[0].second;
                    }
                    from = what[0].second; // get past the close pattern

                    if (!value.is_initialized())
                        throw std::runtime_error(std::string() + "no value set for variable \"" + var + "\".");

                    if (varNo++ > 0)
                        std::cout << ", ";
                    std::cout << "?" << var + "→";
                    switch (type) {
                    case type_UNSET:
                        throw std::runtime_error(std::string() + "no node type set for variable \"" + var + "\".");
                    case type_URI:
                        std::cout << "<" + stringToCurlyNotation(*value) + ">";
                        break;
                    case type_BNODE:
                        std::cout << "_:" + stringToCurlyNotation(*value);
                        break;
                    case type_LITERAL:
                        std::cout << "\"" + stringToCurlyNotation(*value) + "\"";
                        if (lang.is_initialized())
                            std::cout << "@" << *lang;
                        break;
                    case type_TYPEDLITERAL:
                        if (!datatype.is_initialized())
                            throw std::runtime_error(std::string() + "no xml:type set for variable \"" + var + "\".");
                        std::cout << "\"" + stringToCurlyNotation(*value) + "\"^^<" + *datatype + ">";
                        break;
                    }
#endif /* !ORDERED */
                }
                regex_search(from, to, what, close, flags);
                from = what[0].second; // get past the close pattern
                std::cout << "}\n";
            }
        }

        /* Make sure we didn't get lost.
         */
        expect(boost::regex("\\A\\][ \\n]*\\}[ \\n]*\\}[ \\n]*"));
        if (from != to) {
            std::string rest(from, to);
            throw std::runtime_error(std::string() + "garbage found at end of stream \"" + rest + "\"");
        }
    }

};

/** Self-test the UTF-8 encoder and the JSON unescaper, then parse a SPARQL
 * JSON results file and print its solutions to std::cout.
 * @param argv  argv[1], when present, names the file to parse; otherwise
 *              "asdf.json" in the current directory is used.
 * @return EXIT_SUCCESS if everything ran, EXIT_FAILURE if an exception was
 *         caught (the reason is written to std::cerr).
 */
int main (int argc, const char* argv[]) {
    const std::string path(argc > 1 ? argv[1] : "asdf.json");
    try {
        // Test UTF8 handling.
        try {
            ToUTF8 toUTF8;
            std::stringstream ss;
            toUTF8(0x98df, std::ostreambuf_iterator<char>(ss)); // Shoku (food)
            assert (ss.str() == "食");
            toUTF8(0x10ffff, std::ostreambuf_iterator<char>(ss)); // just fits.
            toUTF8(0x110000, std::ostreambuf_iterator<char>(ss)); // too large.
            assert (false);
        } catch (ToUTF8::InvalidUCScode& ex) {
            assert (ex.code == 0x110000); // expected exception.
        } catch (...) {
            assert (false); // unexpected exception.
        }

        // Test JSON escape parsing.
        JSONstringToUTF8 unescape;
        assert (unescape(std::string("hello \\\"world\\\"\\nab\\0022\\98dfcd")) == "hello \"world\"\nab\"食cd");

        // Parse some JSON.
        std::ifstream ifs(path.c_str(), std::ios::binary);
        if (!ifs.is_open())
            throw std::runtime_error("couldn't open " + path);
        JSONparser p;
        p(ifs);
        return EXIT_SUCCESS;
    } catch (std::runtime_error const& e) {
        std::cerr << "std::runtime_error: " << e.what() << "\n";
    } catch (std::string const& e) {
        std::cerr << "std::string: " << e << "\n";
    } catch (...) {
        std::cerr << "˙buoɹʍ ʎɹǝʌ ʇuǝʍ buıɥʇǝɯos\n";
    }
    // Only reached through one of the handlers above.
    return EXIT_FAILURE;
}