Repositorios git - scryer-prolog.git/commitdiff
Work in progress: add char_utf8bytes (#493)
author Matthieu Wipliez <[email protected]>
Fri, 15 May 2020 16:05:04 +0000 (18:05 +0200)
committer GitHub <[email protected]>
Fri, 15 May 2020 16:05:04 +0000 (10:05 -0600)
* Add char_utf8bytes to library

* Improved implementation

* Improve code + add chars predicate

* Update example

* Renamed string_utf8bytes to char_utf8bytes

src/prolog/examples/utf8.pl [new file with mode: 0644]
src/prolog/lib/charsio.pl

diff --git a/src/prolog/examples/utf8.pl b/src/prolog/examples/utf8.pl
new file mode 100644 (file)
index 0000000..4b45219
--- /dev/null
@@ -0,0 +1,15 @@
+:- use_module(library(charsio)).
+:- use_module(library(iso_ext)).
+:- use_module(library(lists)).
+
+:- initialization(unit_test).
+
+% Smoke test run at load time: encodes "a", U+00A3 and U+2124 and checks
+% the resulting 1-, 2- and 3-byte UTF-8 sequences.
+unit_test :-
+  Chars = "a£\x2124\",
+  chars_utf8bytes(Chars, Bytes),
+  Bytes = [97, 194, 163, 226, 132, 164].
+
+% Writes the UTF-8 encoding of U+00A3, U+2124, U+2764, U+1F496 and a newline
+% to the binary file 'x.txt'.
+% The bytes are computed before the file is opened, and the stream is closed
+% through setup_call_cleanup/3, so it is released even when writing throws
+% or fails.
+write_f :-
+  chars_utf8bytes("£\x2124\\x2764\\x1F496\\n", Bs),
+  setup_call_cleanup(
+    open('x.txt', write, Stream, [type(binary)]),
+    maplist(put_byte(Stream), Bs),
+    close(Stream)).
index 6e5a4e933c1b069eaccc5ac1309a90727c7a5158..023ccbadc74ebe7a79db1fe1c5eb71318fa1a4f6 100644 (file)
@@ -1,10 +1,13 @@
-:- module(charsio, [char_type/2, get_single_char/1,
+:- module(charsio, [char_type/2,
+                    chars_utf8bytes/2,
+                    get_single_char/1,
                     read_term_from_chars/2,
                     write_term_to_chars/3]).
 
+:- use_module(library(dcgs)).
 :- use_module(library(iso_ext)).
 :- use_module(library(error)).
-:- use_module(library(lists), [append/3]).
+:- use_module(library(lists)).
 
 fabricate_var_name(VarType, VarName, N) :-
     char_code('A', AC),
@@ -135,3 +138,27 @@ write_term_to_chars(Term, Options, Chars) :-
     term_variables(Term, Vars),
     extend_var_list(Vars, VNNames, NewVarNames, numbervars),
     '$write_term_to_chars'(Chars, Term, IgnoreOps, NumberVars, Quoted, NewVarNames, MaxDepth).
+
+% char_utf8bytes(Ch, Bytes)
+% Bytes is the list of UTF-8 byte values that encode the single character Ch.
+% TODO: support the reverse direction (Ch unbound, decode Bytes to a char).
+char_utf8bytes(Ch, Bytes) :-
+  char_code(Ch, CodePoint),
+  phrase(code_to_utf8(CodePoint), Bytes, []).
+
+% code_to_utf8(Code)//
+% Describes the UTF-8 byte sequence for the code point Code: one byte below
+% 0x80, two below 0x800, three below 0x10000 and four below 0x110000.
+% Fails for larger values.
+%
+% The cut comes directly after each range test so that a clause is committed
+% to as soon as its range matches. With the cut at the end of the body, a
+% bound byte list that did not match the shortest form fell through to the
+% wider clauses, so overlong encodings such as [0xC1, 0xA1] for 0x61 were
+% accepted.
+% Arithmetic (<)/2 is used instead of (@<)/2 so that an unbound Code raises
+% an instantiation error rather than comparing as smaller than any number.
+code_to_utf8(Code) --> { Code < 0x80 },     !, [Code].
+code_to_utf8(Code) --> { Code < 0x800 },    !, encode(Code, 0xC0, 2).
+code_to_utf8(Code) --> { Code < 0x10000 },  !, encode(Code, 0xE0, 3).
+code_to_utf8(Code) --> { Code < 0x110000 }, !, encode(Code, 0xF0, 4).
+
+% encode(Code, Lead, Count)//
+% Emits Count bytes for Code, six bits per byte, most significant group
+% first. The first byte is or-ed with Lead; every following byte is or-ed
+% with the continuation marker 0x80.
+encode(_, _, 0) --> !, [].
+encode(Code, Lead, Count) -->
+  { Remaining is Count - 1,
+    Shift is 6 * Remaining,
+    Bits is (Code >> Shift) /\ 0x3F,
+    Byte is Lead \/ Bits },
+  [Byte],
+  encode(Code, 0x80, Remaining).
+
+% chars_utf8bytes(Cs, Bs)
+% Bs is the concatenation of the UTF-8 byte sequences of the characters in
+% the list Cs. Cs is checked first: it must be a list whose elements are
+% all atoms.
+% TODO: support the reverse direction (Cs unbound, decode Bs to chars).
+chars_utf8bytes(Cs, Bs) :-
+  must_be(list, Cs),
+  maplist(must_be(atom), Cs),
+  maplist(char_utf8bytes, Cs, Chunks),
+  append(Chunks, Bs).