From: Matthieu Wipliez Date: Fri, 15 May 2020 16:05:04 +0000 (+0200) Subject: Work in progress: add char_utf8bytes (#493) X-Git-Tag: v0.8.123~30^2~1 X-Git-Url: https://git.sagredo.dev/?a=commitdiff_plain;h=df06d4b9a2d073049d745281141b822b52cf8bc9;p=scryer-prolog.git Work in progress: add char_utf8bytes (#493) * Add char_utf8bytes to library * Improved implementation * Improve code + add chars predicate * Update example * Renamed string_utf8bytes to char_utf8bytes --- diff --git a/src/prolog/examples/utf8.pl b/src/prolog/examples/utf8.pl new file mode 100644 index 00000000..4b452198 --- /dev/null +++ b/src/prolog/examples/utf8.pl @@ -0,0 +1,15 @@ +:- use_module(library(charsio)). +:- use_module(library(lists)). + +:- initialization(unit_test). + +unit_test :- + chars_utf8bytes("a£\x2124\", Bs), + Bs = [97, 194, 163, 226, 132, 164]. + +write_f :- + open('x.txt', write, Stream, [type(binary)]), + F = put_byte(Stream), + chars_utf8bytes("£\x2124\\x2764\\x1F496\\n", Bs), + maplist(F, Bs), + close(Stream). diff --git a/src/prolog/lib/charsio.pl b/src/prolog/lib/charsio.pl index 6e5a4e93..023ccbad 100644 --- a/src/prolog/lib/charsio.pl +++ b/src/prolog/lib/charsio.pl @@ -1,10 +1,13 @@ -:- module(charsio, [char_type/2, get_single_char/1, +:- module(charsio, [char_type/2, + chars_utf8bytes/2, + get_single_char/1, read_term_from_chars/2, write_term_to_chars/3]). +:- use_module(library(dcgs)). :- use_module(library(iso_ext)). :- use_module(library(error)). -:- use_module(library(lists), [append/3]). +:- use_module(library(lists)). fabricate_var_name(VarType, VarName, N) :- char_code('A', AC), @@ -135,3 +138,27 @@ write_term_to_chars(Term, Options, Chars) :- term_variables(Term, Vars), extend_var_list(Vars, VNNames, NewVarNames, numbervars), '$write_term_to_chars'(Chars, Term, IgnoreOps, NumberVars, Quoted, NewVarNames, MaxDepth). + +% Encodes the character Ch as Bytes, the list of its UTF-8 byte values. +% TODO: if Ch is a variable, decode Bytes to Ch instead. 
+% Bytes may already be bound: the goal then succeeds only if Bytes is the
+% shortest-form encoding of Ch and fails otherwise. Ch must be instantiated,
+% because char_code/2 is called here with a fresh Code.
+char_utf8bytes(Ch, Bytes) :-
+    char_code(Ch, Code),
+    phrase(code_to_utf8(Code), Bytes).
+
+% code_to_utf8(Code)//
+% Describes the UTF-8 encoding of the code point Code (an integer).
+% Each range guard selects the encoding length. The cut sits directly after
+% the guard so that the choice of length is committed before any byte is
+% compared: if the bytes are already bound and do not match, the clause fails
+% instead of falling through to a longer, overlong form.
+% Fails for Code >= 0x110000.
+code_to_utf8(Code) --> { Code < 0x80 }, !, [Code].
+code_to_utf8(Code) --> { Code < 0x800 }, !, encode(Code, 0xC0, 2).
+code_to_utf8(Code) --> { Code < 0x10000 }, !, encode(Code, 0xE0, 3).
+code_to_utf8(Code) --> { Code < 0x110000 }, encode(Code, 0xF0, 4).
+
+% encode(Code, Prefix, Nb)//
+% Describes Nb bytes carrying the low 6*Nb bits of Code, most significant
+% 6-bit group first. The first byte is tagged with Prefix, every following
+% byte with the continuation marker 0x80.
+% The 0x3F mask is applied to the lead byte as well; this is safe because the
+% range guards in code_to_utf8//1 keep the leading group within the bits that
+% Prefix leaves free (5, 4 and 3 bits for 0xC0, 0xE0 and 0xF0).
+encode(_, _, 0) --> !.
+encode(Code, Prefix, Nb) -->
+    { Nb1 is Nb - 1,
+      Byte is Prefix \/ ((Code >> (6 * Nb1)) /\ 0x3F) },
+    [Byte],
+    encode(Code, 0x80, Nb1).
+
+% Encodes a list of characters Cs to a list of UTF-8 bytes Bs.
+% Cs is first checked with must_be/2 to be a list whose elements are atoms;
+% each character is then encoded with char_utf8bytes/2 and the per-character
+% byte lists are concatenated in order.
+% TODO: if Cs is variable, decode bytes to chars instead.
+chars_utf8bytes(Cs, Bs) :-
+    must_be(list, Cs),
+    maplist(must_be(atom), Cs),
+    maplist(char_utf8bytes, Cs, ByteLists),
+    append(ByteLists, Bs).