Implementation of UTF-8 bytes to chars

author Matthieu Wipliez <[email protected]>

Sun, 17 May 2020 21:37:41 +0000 (23:37 +0200)

committer Matthieu Wipliez <[email protected]>

Fri, 22 May 2020 21:33:12 +0000 (23:33 +0200)
author Matthieu Wipliez <[email protected]>
Sun, 17 May 2020 21:37:41 +0000 (23:37 +0200)
committer Matthieu Wipliez <[email protected]>
Fri, 22 May 2020 21:33:12 +0000 (23:33 +0200)
diff --git a/src/prolog/examples/utf8.pl b/src/prolog/examples/utf8.pl

index 4b452198788b93c66cb7a4b6b06135556517bd70..bee6b2b24f4d1c3851ab19332aebfd8226c8913e 100644 (file)
--- a/src/prolog/examples/utf8.pl
+++ b/src/prolog/examples/utf8.pl
@@ -5,7 +5,9 @@
  
  unit_test :-
    chars_utf8bytes("a£\x2124\", Bs),
-  Bs = [97, 194, 163, 226, 132, 164].
+  Bs = [97, 194, 163, 226, 132, 164],  % a = 97; U+00A3 = C2 A3; U+2124 = E2 84 A4
+  chars_utf8bytes(Cs, Bs),  % Cs is unbound here: the same bytes are decoded back to characters
+  Cs = "a£\x2124\".  % the round trip must reproduce the original characters
  
  write_f :-
    open('x.txt', write, Stream, [type(binary)]),
@@ -13,3 +15,17 @@ write_f :-
    chars_utf8bytes("£\x2124\\x2764\\x1F496\\n", Bs),
    maplist(F, Bs),
    close(Stream).
+
+get_bytes(Stream, Bytes) :- get_bytes(Stream, [], Bytes).  % Bytes: every byte read from Stream until end of stream, in reading order
+get_bytes(Stream, Seen, Bytes) :-  % Seen: bytes read so far, most recent first
+  get_byte(Stream, Byte),
+  (Byte =\= -1 ->  % get_byte/2 yields -1 once the end of the stream is reached
+    get_bytes(Stream, [Byte|Seen], Bytes)
+  ; reverse(Seen, Bytes)).
+
+read_f :-  % writes the contents of 'x.txt', decoded from UTF-8, to the current output
+  open('x.txt', read, In, [type(binary)]),
+  get_bytes(In, Bytes),
+  chars_utf8bytes(Chars, Bytes),  % Chars is unbound, so this call decodes Bytes
+  write(Chars),
+  close(In).  % NOTE(review): not reached if an earlier goal fails or throws, leaving the stream open
diff --git a/src/prolog/lib/charsio.pl b/src/prolog/lib/charsio.pl

index 023ccbadc74ebe7a79db1fe1c5eb71318fa1a4f6..e57a40aaa0f9b5706b091e65eae1b24fe6e2cc83 100644 (file)
--- a/src/prolog/lib/charsio.pl
+++ b/src/prolog/lib/charsio.pl
@@ -140,7 +140,6 @@ write_term_to_chars(Term, Options, Chars) :-
      '$write_term_to_chars'(Chars, Term, IgnoreOps, NumberVars, Quoted, NewVarNames, MaxDepth).
  
  % Encodes Ch character to list of Bytes.
-% TODO: if Ch is variable, decode Bytes to Char.
  char_utf8bytes(Ch, Bytes) :-
    char_code(Ch, Code),
    phrase(code_to_utf8(Code), Bytes).
@@ -155,10 +154,31 @@ encode(Code, Prefix, Nb) -->
    { Nb1 is Nb - 1, Byte is Prefix \/ ((Code >> (6 * Nb1)) /\ 0x3F) },
    [Byte], encode(Code, 0x80, Nb1).
  
-% Encodes a list of characters Cs to a list of UTF-8 bytes Bs.
-% TODO: if Cs is variable, decode bytes to chars instead. 
+% Maps between a list of characters and a list of UTF-8 bytes.
+% If Cs is a variable, parses Bs as a list of UTF-8 bytes.
+% Otherwise, transforms the list of characters Cs to UTF-8 bytes.
  chars_utf8bytes(Cs, Bs) :-
-  must_be(list, Cs),
-  maplist(must_be(atom), Cs),
-  maplist(char_utf8bytes, Cs, Bss),
-  append(Bss, Bs).
+  var(Cs), must_be(list, Bs) ->  % decoding direction: Cs unbound, Bs checked to be a list
+    once(phrase(decode_utf8(Cs), Bs))  % once/1 commits to the first parse found
+  ; (must_be(list, Cs),  % encoding direction
+     maplist(must_be(atom), Cs),
+     maplist(char_utf8bytes, Cs, Bss),  % Bss: one list of bytes per character
+     append(Bss, Bs)).  % concatenates the per-character byte lists into Bs
+
+decode_utf8([]) --> [].  % empty list of characters: consumes no bytes
+decode_utf8(Cs) --> leading(Len, Bits), continuation(Bits, Cs, Len).  % first byte of a sequence, then the rest of it and of the input
+
+leading(1, Byte) --> [Byte], {Byte /\ 0x80 =:= 0}.  % leading(Len, Code)//: reads the first byte of a sequence; Len is the sequence length, Code its payload bits. 0xxxxxxx: the byte is the code
+leading(2, Code) --> [Byte], {Byte /\ 0xE0 =:= 0xC0, Byte >= 0xC2, Code is Byte - 0xC0}.  % 110xxxxx; 0xC0 and 0xC1 could only start overlong forms, so they are rejected
+leading(3, Code) --> [Byte], {Byte /\ 0xF0 =:= 0xE0, Code is Byte - 0xE0}.  % 1110xxxx
+leading(4, Code) --> [Byte], {Byte /\ 0xF8 =:= 0xF0, Byte =< 0xF4, Code is Byte - 0xF0}.  % 11110xxx; 0xF5..0xF7 would encode values above U+10FFFF, so they are rejected
+leading(1, 0xFFFD) --> [_]. % invalid first byte: consumed alone and reported as U+FFFD
+
+% continuation(Code, Chars, Nb)//: Code accumulates the code point; Nb - 1 continuation bytes are still expected.
+continuation(Code, [H|T], 1) --> {char_code(H, Code)}, decode_utf8(T).  % sequence complete. NOTE(review): Code is not checked for surrogates or overlong 3/4-byte forms
+continuation(Code, Chars, Nb) --> [Byte],  % folds the low six bits of a 10xxxxxx byte into Code
+  {Nb > 1, Byte /\ 0xC0 =:= 0x80, Nb1 is Nb - 1, NextCode is (Code << 6) \/ (Byte - 0x80)},
+  continuation(NextCode, Chars, Nb1).
+% Missing or invalid continuation byte: emit U+FFFD but consume nothing, so the
+% offending byte (possibly a valid character) is decoded afresh instead of dropped.
+continuation(_, ['\xFFFD\'|T], Nb) --> {Nb > 1}, decode_utf8(T).
author	Matthieu Wipliez <[email protected]>
	Sun, 17 May 2020 21:37:41 +0000 (23:37 +0200)
committer	Matthieu Wipliez <[email protected]>
	Fri, 22 May 2020 21:33:12 +0000 (23:33 +0200)
src/prolog/examples/utf8.pl		patch \| blob \| history
src/prolog/lib/charsio.pl		patch \| blob \| history