From 0e73b5380376eb81c984b304faf2143a6ee94e88 Mon Sep 17 00:00:00 2001 From: panasenco Date: Sun, 18 Apr 2021 23:21:12 -0700 Subject: [PATCH] Complete reordering and partial rewrite to match the official McKeeman form of the JSON specification --- src/lib/json.pl | 319 +++++++++++++++++++++++++++--------------------- 1 file changed, 183 insertions(+), 136 deletions(-) diff --git a/src/lib/json.pl b/src/lib/json.pl index 4b6915ff..1875d3c7 100644 --- a/src/lib/json.pl +++ b/src/lib/json.pl @@ -34,15 +34,11 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ :- module(json, [ - json_whitespace//0, - json_string//1, - json_number//1, - json_value//1, - json_array//1, - json_object//1 + json_chars//1 ]). :- use_module(library(assoc)). +:- use_module(library(between)). :- use_module(library(charsio)). :- use_module(library(clpz)). :- use_module(library(dcgs)). @@ -51,16 +47,72 @@ :- use_module(library(lists)). :- use_module(library(reif)). -char_uniontypes(Char, Types) :- - must_be(list, Types), - bagof(Type, (char_type(Char, Type), member(Type, Types)), [_|_]). +/* The DCGs are written to match the McKeeman Form presented on the right side of https://www.json.org/json-en.html + almost perfectly. Note that the McKeeman form conflicts some with the pictures on the left side. */ +json_chars(Internal) --> json_element(Internal). -json_whitespace --> "". -json_whitespace --> " ", json_whitespace. -json_whitespace --> "\n", json_whitespace. -json_whitespace --> "\r", json_whitespace. -json_whitespace --> "\t", json_whitespace. +/* Because it's impossible to distinguish between an empty array [] and an empty string "", we distinguish between + different types of values based on their principal functor. The principal functors match the types defined in + the JSON Schema spec here: https://json-schema.org/draft/2020-12/json-schema-validation.html#rfc.section.6.1.1 + Down the line we'll incorporate more JSON Schema support, but this is it for now. */ +json_value(object(Assoc)) --> json_object(Assoc). +json_value(array(List)) --> json_array(List). +json_value(string(Chars)) --> json_string(Chars). +json_value(number(Number)) --> json_number(Number). +json_value(boolean(true)) --> "true". +json_value(boolean(false)) --> "false". +json_value(null) --> "null". +/* Read Bob Kowalski's "Algorithm = Logic + Control": + https://www.doc.ic.ac.uk/~rak/papers/algorithm%20=%20logic%20+%20control.pdf + This DCG definition does two things: + 1. Logic: Relate an association list to a JSON object serialized in a string. + 2. Control: Define the exact strategy by which we obtain an association list from a JSON string and vice versa. + This is done via instantiation checks `var/1` and `nonvar/1`. + Unfortunately, the logic and control in this DCG aren't separated cleanly like Bob Kowalski proposed. + Maybe at some point in the future we'll have a library that takes a pure logic character parsing/generating DCG + and 'injects' control strategy into it. We aren't there yet... */ +json_object(EmptyAssoc) --> {empty_assoc(EmptyAssoc)}, "{", json_ws, "}". +json_object(Assoc) --> + { + nonvar(Assoc) -> + \+ empty_assoc(Assoc), + assoc_to_list(Assoc, [Pair|Pairs]) + ; true + }, + "{", + json_members([Pair|Pairs]), + "}", + { + var(Assoc) -> + list_to_assoc([Pair|Pairs], Assoc) + ; true + }. + +json_members([Key-Value]) --> json_member(Key, Value). +json_members([Key-Value | Pairs]) --> json_member(Key, Value), ",", json_members(Pairs). + +json_member(Key, Value) --> json_ws, json_string(Key), json_ws, ":", json_element(Value). + +json_array([]) --> "[", json_ws, "]". +json_array([Value|Values]) --> "[", json_elements([Value|Values]), "]". + +json_elements([Value]) --> json_element(Value). +json_elements([Value|Values]) --> json_element(Value), ",", json_elements(Values). + +json_element(Value) --> json_ws, json_value(Value), json_ws. + +json_string(Chars) --> "\"", json_characters(Chars), "\"". + +json_characters("") --> "". +json_characters([Char|Chars]) --> json_character(Char), json_characters(Chars). + +/* A directly printable character is defined by the JSON spec as a character between 0020 and 10FFFF except the + escaped characters. + Note that `char_code/2` throws an instantiation error if both its arguments are undefined, so we delay + calling it until we've seen both the generating and the parsing sides of the DCG. + If we moved the block containing `char_code/2` up before `[PrintChar]`, we would still be able to generate JSON, + but attempting to parse JSON would cause an instantiation error. */ escape_map([ '"' - '"', ('\\') - ('\\'), @@ -72,149 +124,144 @@ escape_map([ '\t' - 't' ]). -hex(0) --> "0". -hex(1) --> "1". -hex(2) --> "2". -hex(3) --> "3". -hex(4) --> "4". -hex(5) --> "5". -hex(6) --> "6". -hex(7) --> "7". -hex(8) --> "8". -hex(9) --> "9". -hex(10) --> "a". -hex(11) --> "b". -hex(12) --> "c". -hex(13) --> "d". -hex(14) --> "e". -hex(15) --> "f". - -inner_string("") --> "". -inner_string([PrintChar | Tail]) --> +json_character(PrintChar) --> [PrintChar], { escape_map(EscapeMap), \+ member(PrintChar-_, EscapeMap), - ( - PrintChar = ' ' - ; char_uniontypes(PrintChar, [alphanumeric, ascii_graphic]) - ) - }, - inner_string(Tail). -inner_string([EscapeChar | Tail]) --> - "\\", + char_code(PrintChar, PrintCharCode), + PrintCharCode in 32..1114111 /* 20.10FFFF */ + }. +json_character(EscapeChar) --> "\\", json_escape(EscapeChar). + +json_escape(EscapeChar) --> [PrintChar], { escape_map(EscapeMap), member(EscapeChar-PrintChar, EscapeMap) - }, - inner_string(Tail). -inner_string([NonPrintChar | Tail]) --> - "\\u", - { + }. +json_escape(EscapeChar) --> + "u", + { /* Logic: Define the domain of the escape character as well as the relationship between the escape character + and the four hexes */ [H1, H2, H3, H4] ins 0..15, - NonPrintCharCode in 0..65535, - NonPrintCharCode #= H1 * 16^3 + H2 * 16^2 + H3 * 16 + H4, - ( - ground(NonPrintChar) -> - escape_map(EscapeMap), - \+ member(NonPrintChar-_, EscapeMap), - dif(NonPrintChar, ' '), - \+ char_uniontypes(NonPrintChar, [alphanumeric, ascii_graphic]), - char_code(NonPrintChar, NonPrintCharCode) - ; true - ) + EscapeCharCode in 0..65535, + EscapeCharCode #= H1 * 16^3 + H2 * 16^2 + H3 * 16 + H4 }, - hex(H1), - hex(H2), - hex(H3), - hex(H4), - { - \+ ground(NonPrintChar) -> - char_code(NonPrintChar, NonPrintCharCode) + { /* Control: Get the code of the escape character if we can. Otherwise we'll end up backtracking over 65,536 + possible hex values. + Logic: Only the first 32 Unicode characters not escaped in the escape map are eligible for \u-escaping + when generating. However, we want to be able to parse any of the 65,536 \u-escaped values when parsing. */ + nonvar(EscapeChar) -> + char_code(EscapeChar, EscapeCharCode), + EscapeCharCode in 0..31, + escape_map(EscapeMap), + \+ member(EscapeChar-_, EscapeMap) ; true }, - inner_string(Tail). - -json_string(Inner) --> - "\"", - inner_string(Inner), - "\"". - -posdigit(Digit) --> [Digit], {member(Digit, "123456789")}. -digit('0') --> "0". -digit(Digit) --> posdigit(Digit). -number_str(['-'|Rest], sign) --> "-", number_str(Rest, wholestart). -number_str(Rest, sign) --> number_str(Rest, wholestart). -number_str([PosDigit|Rest], wholestart) --> posdigit(PosDigit), number_str(Rest, wholerest). -number_str(['0'|Rest], wholestart) --> "0", number_str(Rest, fractionstart). -number_str([Digit|Rest], wholerest) --> digit(Digit), number_str(Rest, wholerest). -number_str(Rest, wholerest) --> number_str(Rest, fractionstart). -number_str(Rest, wholerest) --> number_str(Rest, exponentstart). -number_str(['.'|Rest], fractionstart) --> ".", number_str(Rest, fraction). -number_str([Digit|Rest], fraction) --> digit(Digit), number_str(Rest, fraction). -number_str([Digit|Rest], fraction) --> digit(Digit), number_str(Rest, exponentstart). -number_str(['e'|Rest], exponentstart) --> "e", number_str(Rest, exponentsign). -number_str(['e'|Rest], exponentstart) --> "E", number_str(Rest, exponentsign). -number_str("", exponentstart) --> "". -number_str(['-'|Rest], exponentsign) --> "-", number_str(Rest, exponent). -number_str(Rest, exponentsign) --> "+", number_str(Rest, exponent). -number_str(Rest, exponentsign) --> number_str(Rest, exponent). -number_str([Digit|Rest], exponent) --> digit(Digit), number_str(Rest, exponent). -number_str([Digit], exponent) --> digit(Digit). + json_hex(H1), + json_hex(H2), + json_hex(H3), + json_hex(H4), + { /* Control + Logic: Get the escape character atom from the character code computed from the hexes. */ + var(EscapeChar) -> + char_code(EscapeChar, EscapeCharCode) + ; true + }. +json_hex(Digit) --> json_digit(Digit). +json_hex(10) --> "a". +json_hex(11) --> "b". +json_hex(12) --> "c". +json_hex(13) --> "d". +json_hex(14) --> "e". +json_hex(15) --> "f". +json_hex(10) --> "A". +json_hex(11) --> "B". +json_hex(12) --> "C". +json_hex(13) --> "D". +json_hex(14) --> "E". +json_hex(15) --> "F". + +/* Here we are going to write completely different DCGs for parsing and generating, and rely on built-in + predicates. However, the underlying logic remains the same. */ json_number(Number) --> { - ground(Number) -> - ( - number(Number) -> - number_chars(Number, NumberChars) - ; false - ) - ; true + nonvar(Number) -> + number_chars(Number, NumberChars) + ; false }, - number_str(NumberChars, sign), + NumberChars. +json_number(Number) --> + { + var(Number) + }, + json_sign_noplus(Sign), + json_integer(Integer), + json_fraction(Fraction), + json_exponent(Exponent), + { + Number is Sign * (Integer + Fraction) * 10.0 ^ Exponent + }. + +json_integer(Digit) --> json_digit(Digit). +json_integer(TotalValue) --> + json_onenine(FirstDigit), + json_digits(RemainingValue, Power), + { + TotalValue #= FirstDigit * 10 ^ (Power + 1) + RemainingValue + }. + +json_digits(Digit, 0) --> json_digit(Digit). +json_digits(Value, Power) --> + json_digit(FirstDigit), + json_digits(RemainingValue, NextPower), { - ground(Number) -> - true - ; number_chars(Number, NumberChars) + Power #= NextPower + 1, + Value #= FirstDigit * 10^Power + RemainingValue }. -inner_value(string(Chars)) --> json_string(Chars). -inner_value(number(Number)) --> json_number(Number). -inner_value(object(Object)) --> json_object(Object). -inner_value(array(Array)) --> json_array(Array). -inner_value(boolean(true)) --> "true". -inner_value(boolean(false)) --> "false". -inner_value(null) --> "null". -json_value(Value) --> json_whitespace, inner_value(Value), json_whitespace. - -inner_array([]) --> "". -inner_array([Value]) --> json_value(Value). -inner_array([Value1, Value2 | Tail]) --> - json_value(Value1), - ",", - inner_array([Value2 | Tail]). -json_array(List) --> "[", inner_array(List), "]". - -json_member(Key, Value) --> json_whitespace, json_string(Key), json_whitespace, ":", json_value(Value). +json_digit(0) --> "0". +json_digit(Digit) --> json_onenine(Digit). -json_members([Key-Value]) --> json_member(Key, Value). -json_members([Key-Value | Tail]) --> json_member(Key, Value), ",", json_members(Tail). +json_onenine(1) --> "1". +json_onenine(2) --> "2". +json_onenine(3) --> "3". +json_onenine(4) --> "4". +json_onenine(5) --> "5". +json_onenine(6) --> "6". +json_onenine(7) --> "7". +json_onenine(8) --> "8". +json_onenine(9) --> "9". -json_object(EmptyAssoc) --> {empty_assoc(EmptyAssoc)}, "{", json_whitespace, "}". -json_object(Assoc) --> +json_fraction(0) --> "". +json_fraction(Fraction) --> + ".", + json_digits(Value, Power), { - nonvar(Assoc) -> - \+ empty_assoc(Assoc), - assoc_to_list(Assoc, [Pair|Pairs]) - ; true - }, - "{", - json_members([Pair|Pairs]), - "}", + Fraction is Value / 10 ^ (Power + 1) + }. + +json_exponent(0) --> "". +json_exponent(Exponent) --> + json_exponent_signifier, + json_sign(Sign), + json_digits(Value, _), { - var(Assoc) -> - list_to_assoc([Pair|Pairs], Assoc) - ; true + Exponent #= Sign * Value }. + +json_exponent_signifier --> "E". +json_exponent_signifier --> "e". + +json_sign_noplus(1) --> "". +json_sign_noplus(-1) --> "-". + +json_sign(Sign) --> json_sign_noplus(Sign). +json_sign(1) --> "+". + +json_ws --> "". +json_ws --> " ", json_ws. +json_ws --> "\n", json_ws. +json_ws --> "\r", json_ws. +json_ws --> "\t", json_ws. -- 2.54.0