From: Matthieu Wipliez <matthieu.wipliez@gmail.com>
Date: Sun, 17 May 2020 21:37:41 +0000 (+0200)
Subject: Implementation of UTF-8 bytes to chars
X-Git-Tag: v0.8.127~63^2
X-Git-Url: https://git.sagredo.dev/?a=commitdiff_plain;h=cc9d6587a35adf732dab7df9d78c3ee9d6ab2762;p=scryer-prolog.git

Implementation of UTF-8 bytes to chars
---

diff --git a/src/prolog/examples/utf8.pl b/src/prolog/examples/utf8.pl
index 4b452198..bee6b2b2 100644
--- a/src/prolog/examples/utf8.pl
+++ b/src/prolog/examples/utf8.pl
@@ -5,7 +5,9 @@
 
 unit_test :-
   chars_utf8bytes("a£\x2124\", Bs),
-  Bs = [97, 194, 163, 226, 132, 164].
+  Bs = [97, 194, 163, 226, 132, 164],  % a = 97, £ = 194 163, U+2124 = 226 132 164
+  chars_utf8bytes(Cs, Bs),              % decode the same bytes back into characters
+  Cs = "a£\x2124\".                    % the round trip must restore the original chars
 
 write_f :-
   open('x.txt', write, Stream, [type(binary)]),
@@ -13,3 +15,17 @@ write_f :-
   chars_utf8bytes("£\x2124\\x2764\\x1F496\\n", Bs),
   maplist(F, Bs),
   close(Stream).
+
+get_bytes(Stream, Bytes) :- get_bytes(Stream, [], Bytes).  % Bytes: every byte read from Stream, in order
+get_bytes(Stream, Seen, Bytes) :-                          % Seen: bytes read so far, newest first
+  get_byte(Stream, Byte),
+  (  Byte =:= -1                                           % -1 marks the end of the stream
+  -> reverse(Seen, Bytes)
+  ;  get_bytes(Stream, [Byte|Seen], Bytes) ).
+
+read_f :-  % Reads x.txt as raw bytes, decodes them as UTF-8 and writes the resulting chars.
+  open('x.txt', read, Stream, [type(binary)]),
+  % On any error, close the stream before re-throwing so it is not leaked.
+  catch((get_bytes(Stream, Bs), chars_utf8bytes(Cs, Bs)), Err, (close(Stream), throw(Err))),
+  close(Stream),
+  write(Cs).
diff --git a/src/prolog/lib/charsio.pl b/src/prolog/lib/charsio.pl
index 023ccbad..e57a40aa 100644
--- a/src/prolog/lib/charsio.pl
+++ b/src/prolog/lib/charsio.pl
@@ -140,7 +140,6 @@ write_term_to_chars(Term, Options, Chars) :-
     '$write_term_to_chars'(Chars, Term, IgnoreOps, NumberVars, Quoted, NewVarNames, MaxDepth).
 
 % Encodes Ch character to list of Bytes.
-% TODO: if Ch is variable, decode Bytes to Char.
 char_utf8bytes(Ch, Bytes) :-
   char_code(Ch, Code),
   phrase(code_to_utf8(Code), Bytes).
@@ -155,10 +154,31 @@ encode(Code, Prefix, Nb) -->
   { Nb1 is Nb - 1, Byte is Prefix \/ ((Code >> (6 * Nb1)) /\ 0x3F) },
   [Byte], encode(Code, 0x80, Nb1).
 
-% Encodes a list of characters Cs to a list of UTF-8 bytes Bs.
-% TODO: if Cs is variable, decode bytes to chars instead. 
+% chars_utf8bytes(Cs, Bs): relates a list of characters Cs to its UTF-8 bytes Bs.
+% When Cs is unbound, Bs must be a list; it is decoded and only the first parse is kept.
+% Otherwise Cs must be a list of atoms; each one is encoded and the bytes are appended.
 chars_utf8bytes(Cs, Bs) :-
-  must_be(list, Cs),
-  maplist(must_be(atom), Cs),
-  maplist(char_utf8bytes, Cs, Bss),
-  append(Bss, Bs).
+  (  var(Cs), must_be(list, Bs)
+  -> once(phrase(decode_utf8(Cs), Bs))
+  ;  must_be(list, Cs),
+     maplist(must_be(atom), Cs),
+     maplist(char_utf8bytes, Cs, PerCharBytes),
+     append(PerCharBytes, Bs) ).
+
+decode_utf8([]) --> [].  % no characters: consumes no bytes
+decode_utf8(Cs) --> leading(Len, Code0), continuation(Code0, Cs, Len).  % one leading byte, then its tail and the rest
+
+leading(1, Byte) --> [Byte], {Byte /\ 0x80 =:= 0}. % leading(Len, Code): 0xxxxxxx is a 1-byte sequence
+leading(2, Code) --> [Byte], {0xC2 =< Byte, Byte =< 0xDF, Code is Byte /\ 0x1F}. % 0xC0/0xC1 rejected: overlong
+leading(3, Code) --> [Byte], {Byte /\ 0xF0 =:= 0xE0, Code is Byte /\ 0x0F}. % 1110xxxx starts a 3-byte sequence
+leading(4, Code) --> [Byte], {0xF0 =< Byte, Byte =< 0xF4, Code is Byte /\ 0x07}. % 0xF5..0xF7 rejected: > U+10FFFF
+leading(1, 0xFFFD) --> [_]. % invalid first byte: stands for U+FFFD on its own
+
+% continuation(Code, Chars, Nb): Nb - 1 continuation bytes are still expected for Code.
+continuation(Code, [H|T], 1) --> {char_code(H, Code)}, decode_utf8(T).
+continuation(Code, Chars, Nb) --> [Byte],
+  {Nb > 1, Byte /\ 0xC0 =:= 0x80, Nb1 is Nb - 1, NextCode is (Code << 6) \/ (Byte /\ 0x3F)},
+  continuation(NextCode, Chars, Nb1).
+% Missing or invalid continuation byte: emit U+FFFD for the broken sequence but do
+% NOT consume the offending byte, so a valid ASCII or leading byte there is kept.
+continuation(_, ['\xFFFD\'|T], Nb) --> {Nb > 1}, decode_utf8(T).