From: Markus Triska Date: Thu, 11 Jun 2020 13:29:06 +0000 (+0200) Subject: ENHANCED: phrase_from_file/[2,3] now read the file lazily. X-Git-Tag: v0.8.127~39^2^2~1 X-Git-Url: https://git.sagredo.dev/?a=commitdiff_plain;h=40c3e31955598bb37e516b777ca30c6d6ac2ad93;p=scryer-prolog.git ENHANCED: phrase_from_file/[2,3] now read the file lazily. This allows processing extremely large files. In addition, the efficient string representation helps to reduce memory usage. These features unleash the full power of Prolog for text processing, the exact use case Prolog was designed for. --- diff --git a/src/prolog/clause_types.rs b/src/prolog/clause_types.rs index a6257082..9b013dbc 100644 --- a/src/prolog/clause_types.rs +++ b/src/prolog/clause_types.rs @@ -180,7 +180,6 @@ pub enum SystemClauseType { ExpandTerm, FetchGlobalVar, FetchGlobalVarWithOffset, - FileToChars, FirstStream, FlushOutput, GetByte, @@ -350,7 +349,6 @@ impl SystemClauseType { &SystemClauseType::FetchGlobalVarWithOffset => { clause_name!("$fetch_global_var_with_offset") } - &SystemClauseType::FileToChars => clause_name!("$file_to_chars"), &SystemClauseType::FirstStream => clause_name!("$first_stream"), &SystemClauseType::FlushOutput => clause_name!("$flush_output"), &SystemClauseType::GetByte => clause_name!("$get_byte"), @@ -549,7 +547,6 @@ impl SystemClauseType { ("$expand_goal", 2) => Some(SystemClauseType::ExpandGoal), ("$fetch_global_var", 2) => Some(SystemClauseType::FetchGlobalVar), ("$fetch_global_var_with_offset", 3) => Some(SystemClauseType::FetchGlobalVarWithOffset), - ("$file_to_chars", 3) => Some(SystemClauseType::FileToChars), ("$get_byte", 2) => Some(SystemClauseType::GetByte), ("$get_char", 2) => Some(SystemClauseType::GetChar), ("$get_code", 2) => Some(SystemClauseType::GetCode), diff --git a/src/prolog/lib/pio.pl b/src/prolog/lib/pio.pl index 4c0f9046..4132752b 100644 --- a/src/prolog/lib/pio.pl +++ b/src/prolog/lib/pio.pl @@ -3,6 +3,8 @@ :- use_module(library(dcgs)). :- use_module(library(error)). +:- use_module(library(freeze)). +:- use_module(library(iso_ext), [setup_call_cleanup/3, partial_string/3]). :- use_module(library(lists), [member/2]). phrase_from_file(NT, File) :- @@ -19,6 +21,68 @@ phrase_from_file(NT, File, Options) :- member(Type, [text,binary]) ; Type = text ), - '$file_to_chars'(File, Chars, Type), - phrase(NT, Chars) - ). + setup_call_cleanup(open(File, read, Stream, [reposition(true)|Options]), + ( stream_to_lazy_list(pio:Type, Stream, Xs), + phrase(NT, Xs) ), + close(Stream)) + ). + + +stream_to_lazy_list(Type_3, Stream, Xs) :- + stream_property(Stream, position(Pos)), + freeze(Xs, reader_step(Type_3, Stream, Pos, Xs)). + +reader_step(Type_3, Stream, Pos, Xs0) :- + set_stream_position(Stream, Pos), + ( at_end_of_stream(Stream) + -> Xs0 = [] + ; % phrase(call(call(Type_3,Stream)), Xs0,Xs), % conforming call + call(Type_3, Stream, Cs,[]), % effective call + partial_string(Cs, Xs0, Xs), + stream_to_lazy_list(Type_3, Stream, Xs) + ). + +binary(Stream, Xs0, Xs) :- get_pending_bytes(Stream, Xs0, Xs). +text(Stream, Xs0, Xs) :- get_pending_chars(Stream, Xs0, Xs). + + +get_pending_chars(Stream, Chs0,Chs) :- + n_get_chars(4096, Stream, Chs, Chs0,Chs). + +% EOF means: If EOF == [], then EOF has definitely been reached, otherwise +% it is unknown and the argument remains uninstantiated. + +% To improve performance, the following predicates should be replaced +% by a fast Rust implementation that reads a number of characters (or +% bytes) at once. + +% Files that do not contain 0-bytes can even be mmapped to memory. + +n_get_chars(N0, Stream, EOF, Chs0,Chs) :- + N0 > 0, + N1 is N0-1, + get_char(Stream, Ch), + ( Ch == end_of_file + -> Chs0 = Chs, + EOF = [] + ; Chs0 = [Ch|Chs1], + n_get_chars(N1, Stream, EOF, Chs1,Chs) + ). +n_get_chars(0, _, _, Chs,Chs). + + +get_pending_bytes(Stream, Chs0,Chs) :- + n_get_bytes(4096, Stream, Chs, Chs0,Chs). + +n_get_bytes(N0, Stream, EOF, Chs0,Chs) :- + N0 > 0, + N1 is N0-1, + get_byte(Stream, Byte), + ( Byte == -1 + -> Chs0 = Chs, + EOF = [] + ; char_code(Ch, Byte), + Chs0 = [Ch|Chs1], + n_get_bytes(N1, Stream, EOF, Chs1,Chs) + ). +n_get_bytes(0, _, _, Chs,Chs). diff --git a/src/prolog/machine/system_calls.rs b/src/prolog/machine/system_calls.rs index d4607c59..e6867c99 100644 --- a/src/prolog/machine/system_calls.rs +++ b/src/prolog/machine/system_calls.rs @@ -27,7 +27,7 @@ use std::collections::BTreeSet; use std::convert::TryFrom; use std::io::{ErrorKind, Read, Write}; use std::iter::{once, FromIterator}; -use std::fs::{File, OpenOptions}; +use std::fs::{OpenOptions}; use std::net::{TcpListener, TcpStream}; use std::ops::Sub; use std::rc::Rc; @@ -1820,109 +1820,6 @@ impl MachineState { } }; } - &SystemClauseType::FileToChars => { - // TODO: Replace this with stream. - let a1 = self.store(self.deref(self[temp_v!(1)])); - let a2 = self.store(self.deref(self[temp_v!(2)])); - - let file_name = match a1 { - Addr::Con(h) if self.heap.atom_at(h) => { - if let HeapCellValue::Atom(name, _) = &self.heap[h] { - name.clone() - } - else { - unreachable!() - } - } - Addr::Char(c) => { - clause_name!(c.to_string(), indices.atom_tbl.clone()) - } - _ => { - unreachable!() - } - }; - - let name = clause_name!("$file_to_chars"); - let mut file = match File::open(file_name.as_str()) { - Ok(f) => f, - Err(e) => { - let arity = 2; - let stub = MachineError::functor_stub(name.clone(), arity); - let h = self.heap.h(); - - let err = match e.kind() { - ErrorKind::NotFound => { - MachineError::existence_error( - h, - ExistenceError::ModuleSource( - ModuleSource::File(file_name) - ), - ) - } - ErrorKind::PermissionDenied => { - let source_sink = self.store(self.deref(a1)); - - MachineError::permission_error( - h, - Permission::Access, - "source_sink", - source_sink - ) - } - _ => unreachable!() // Not nice. - }; - - return Err(self.error_form(err, stub)); - } - }; - - - let type_str = match self.store(self.deref(self[temp_v!(3)])) { - Addr::Con(h) if self.heap.atom_at(h) => { - if let HeapCellValue::Atom(ref atom, _) = &self.heap[h] { - atom.as_str() - } else { - unreachable!() - } - } - _ => { - unreachable!() - } - }; - - let complete_string = { - let mut buffer = String::new(); - match type_str { - "text" => { match file.read_to_string(&mut buffer) { - Ok(_size) => { - self.heap.put_complete_string(&buffer) - } - Err(_e) => { - // the data isn't valid UTF-8, so we fail. - self.fail = true; - return Ok(()); - - } - } - } - "binary" => { let mut buffer = Vec::new(); - let _ = match file.read_to_end(&mut buffer) { - Ok(size) => size, - Err(_e) => unreachable!() - }; - - let buffer = String::from_iter( - buffer.into_iter().map(|b| b as char) - ); - - self.heap.put_complete_string(&buffer) - } - _ => { unreachable!() } - } - }; - - self.unify(complete_string, a2); - } &SystemClauseType::PutCode => { let mut stream = self.get_stream_or_alias(self[temp_v!(1)], indices, "put_code", 2)?;