From: Matthieu Wipliez Date: Sun, 17 May 2020 21:37:41 +0000 (+0200) Subject: Implementation of UTF-8 bytes to chars X-Git-Tag: v0.8.127~63^2 X-Git-Url: https://git.sagredo.dev/?a=commitdiff_plain;h=cc9d6587a35adf732dab7df9d78c3ee9d6ab2762;p=scryer-prolog.git Implementation of UTF-8 bytes to chars --- diff --git a/src/prolog/examples/utf8.pl b/src/prolog/examples/utf8.pl index 4b452198..bee6b2b2 100644 --- a/src/prolog/examples/utf8.pl +++ b/src/prolog/examples/utf8.pl @@ -5,7 +5,9 @@ unit_test :- chars_utf8bytes("a£\x2124\", Bs), - Bs = [97, 194, 163, 226, 132, 164]. + Bs = [97, 194, 163, 226, 132, 164], + chars_utf8bytes(Cs, Bs), + Cs = "a£\x2124\". write_f :- open('x.txt', write, Stream, [type(binary)]), @@ -13,3 +15,17 @@ write_f :- chars_utf8bytes("£\x2124\\x2764\\x1F496\\n", Bs), maplist(F, Bs), close(Stream). + +get_bytes(Stream, Res) :- get_bytes(Stream, [], Res). +get_bytes(Stream, Acc, Res) :- + get_byte(Stream, B), + (B =:= -1 -> + reverse(Acc, Res) + ; get_bytes(Stream, [B|Acc], Res)). + +read_f :- + open('x.txt', read, Stream, [type(binary)]), + get_bytes(Stream, Bs), + chars_utf8bytes(Cs, Bs), + write(Cs), + close(Stream). diff --git a/src/prolog/lib/charsio.pl b/src/prolog/lib/charsio.pl index 023ccbad..e57a40aa 100644 --- a/src/prolog/lib/charsio.pl +++ b/src/prolog/lib/charsio.pl @@ -140,7 +140,6 @@ write_term_to_chars(Term, Options, Chars) :- '$write_term_to_chars'(Chars, Term, IgnoreOps, NumberVars, Quoted, NewVarNames, MaxDepth). % Encodes Ch character to list of Bytes. -% TODO: if Ch is variable, decode Bytes to Char. char_utf8bytes(Ch, Bytes) :- char_code(Ch, Code), phrase(code_to_utf8(Code), Bytes). @@ -155,10 +154,31 @@ encode(Code, Prefix, Nb) --> { Nb1 is Nb - 1, Byte is Prefix \/ ((Code >> (6 * Nb1)) /\ 0x3F) }, [Byte], encode(Code, 0x80, Nb1). -% Encodes a list of characters Cs to a list of UTF-8 bytes Bs. -% TODO: if Cs is variable, decode bytes to chars instead. +% Converts between a list of characters Cs and a list of UTF-8 bytes Bs. 
+% If Cs is a variable, Bs must be a list and is decoded as UTF-8 bytes into Cs (only the first parse is kept). +% Otherwise, transforms the list of characters Cs into its UTF-8 bytes Bs. chars_utf8bytes(Cs, Bs) :- - must_be(list, Cs), - maplist(must_be(atom), Cs), - maplist(char_utf8bytes, Cs, Bss), - append(Bss, Bs). + var(Cs), must_be(list, Bs) -> + once(phrase(decode_utf8(Cs), Bs)) + ; (must_be(list, Cs), + maplist(must_be(atom), Cs), + maplist(char_utf8bytes, Cs, Bss), + append(Bss, Bs)). + +decode_utf8([]) --> []. +decode_utf8(Chars) --> leading(Nb, Code), continuation(Code, Chars, Nb). + +leading(1, Byte) --> [Byte], {Byte /\ 0x80 =:= 0}. +leading(2, Code) --> [Byte], {Byte /\ 0xE0 =:= 0xC0, Code is Byte - 0xC0}. +leading(3, Code) --> [Byte], {Byte /\ 0xF0 =:= 0xE0, Code is Byte - 0xE0}. +leading(4, Code) --> [Byte], {Byte /\ 0xF8 =:= 0xF0, Code is Byte - 0xF0}. +leading(1, 0xFFFD) --> [_]. % invalid first byte: decoded as U+FFFD + +continuation(Code, [H|T], 1) --> {char_code(H, Code)}, decode_utf8(T). +continuation(Code, Chars, Nb) --> [Byte], + {Nb1 is Nb - 1, Byte /\ 0xC0 =:= 0x80, NextCode is (Code << 6) \/ (Byte - 0x80)}, + continuation(NextCode, Chars, Nb1). + +% invalid continuation byte: consume it and emit U+FFFD in its place +% each remaining continuation byte (if any) will then also decode to U+FFFD, through the last clause of leading//2 +continuation(_, ['\xFFFD\'|T], _) --> [_], decode_utf8(T).