From 40c3e31955598bb37e516b777ca30c6d6ac2ad93 Mon Sep 17 00:00:00 2001 From: Markus Triska Date: Thu, 11 Jun 2020 15:29:06 +0200 Subject: [PATCH] ENHANCED: phrase_from_file/[2,3] now read the file lazily. This allows processing extremely large files. In addition, the efficient string representation helps to reduce memory usage. These features unleash the full power of Prolog for text processing, the exact use case Prolog was designed for. --- src/prolog/clause_types.rs | 3 - src/prolog/lib/pio.pl | 70 ++++++++++++++++++- src/prolog/machine/system_calls.rs | 105 +---------------------------- 3 files changed, 68 insertions(+), 110 deletions(-) diff --git a/src/prolog/clause_types.rs b/src/prolog/clause_types.rs index a6257082..9b013dbc 100644 --- a/src/prolog/clause_types.rs +++ b/src/prolog/clause_types.rs @@ -180,7 +180,6 @@ pub enum SystemClauseType { ExpandTerm, FetchGlobalVar, FetchGlobalVarWithOffset, - FileToChars, FirstStream, FlushOutput, GetByte, @@ -350,7 +349,6 @@ impl SystemClauseType { &SystemClauseType::FetchGlobalVarWithOffset => { clause_name!("$fetch_global_var_with_offset") } - &SystemClauseType::FileToChars => clause_name!("$file_to_chars"), &SystemClauseType::FirstStream => clause_name!("$first_stream"), &SystemClauseType::FlushOutput => clause_name!("$flush_output"), &SystemClauseType::GetByte => clause_name!("$get_byte"), @@ -549,7 +547,6 @@ impl SystemClauseType { ("$expand_goal", 2) => Some(SystemClauseType::ExpandGoal), ("$fetch_global_var", 2) => Some(SystemClauseType::FetchGlobalVar), ("$fetch_global_var_with_offset", 3) => Some(SystemClauseType::FetchGlobalVarWithOffset), - ("$file_to_chars", 3) => Some(SystemClauseType::FileToChars), ("$get_byte", 2) => Some(SystemClauseType::GetByte), ("$get_char", 2) => Some(SystemClauseType::GetChar), ("$get_code", 2) => Some(SystemClauseType::GetCode), diff --git a/src/prolog/lib/pio.pl b/src/prolog/lib/pio.pl index 4c0f9046..4132752b 100644 --- a/src/prolog/lib/pio.pl +++ b/src/prolog/lib/pio.pl @@ -3,6 +3,8 @@ :- use_module(library(dcgs)). :- use_module(library(error)). +:- use_module(library(freeze)). +:- use_module(library(iso_ext), [setup_call_cleanup/3, partial_string/3]). :- use_module(library(lists), [member/2]). phrase_from_file(NT, File) :- @@ -19,6 +21,68 @@ phrase_from_file(NT, File, Options) :- member(Type, [text,binary]) ; Type = text ), - '$file_to_chars'(File, Chars, Type), - phrase(NT, Chars) - ). + setup_call_cleanup(open(File, read, Stream, [reposition(true)|Options]), + ( stream_to_lazy_list(pio:Type, Stream, Xs), + phrase(NT, Xs) ), + close(Stream)) + ). + + +stream_to_lazy_list(Type_3, Stream, Xs) :- + stream_property(Stream, position(Pos)), + freeze(Xs, reader_step(Type_3, Stream, Pos, Xs)). + +reader_step(Type_3, Stream, Pos, Xs0) :- + set_stream_position(Stream, Pos), + ( at_end_of_stream(Stream) + -> Xs0 = [] + ; % phrase(call(call(Type_3,Stream)), Xs0,Xs), % conforming call + call(Type_3, Stream, Cs,[]), % effective call + partial_string(Cs, Xs0, Xs), + stream_to_lazy_list(Type_3, Stream, Xs) + ). + +binary(Stream, Xs0, Xs) :- get_pending_bytes(Stream, Xs0, Xs). +text(Stream, Xs0, Xs) :- get_pending_chars(Stream, Xs0, Xs). + + +get_pending_chars(Stream, Chs0,Chs) :- + n_get_chars(4096, Stream, Chs, Chs0,Chs). + +% EOF means: If EOF == [], then EOF has definitely been reached, otherwise +% it is unknown and the argument remains uninstantiated. + +% To improve performance, the following predicates should be replaced +% by a fast Rust implementation that reads a number of characters (or +% bytes) at once. + +% Files that do not contain 0-bytes can even be mmapped to memory. + +n_get_chars(N0, Stream, EOF, Chs0,Chs) :- + N0 > 0, + N1 is N0-1, + get_char(Stream, Ch), + ( Ch == end_of_file + -> Chs0 = Chs, + EOF = [] + ; Chs0 = [Ch|Chs1], + n_get_chars(N1, Stream, EOF, Chs1,Chs) + ). +n_get_chars(0, _, _, Chs,Chs). + + +get_pending_bytes(Stream, Chs0,Chs) :- + n_get_bytes(4096, Stream, Chs, Chs0,Chs). + +n_get_bytes(N0, Stream, EOF, Chs0,Chs) :- + N0 > 0, + N1 is N0-1, + get_byte(Stream, Byte), + ( Byte == -1 + -> Chs0 = Chs, + EOF = [] + ; char_code(Ch, Byte), + Chs0 = [Ch|Chs1], + n_get_bytes(N1, Stream, EOF, Chs1,Chs) + ). +n_get_bytes(0, _, _, Chs,Chs). diff --git a/src/prolog/machine/system_calls.rs b/src/prolog/machine/system_calls.rs index d4607c59..e6867c99 100644 --- a/src/prolog/machine/system_calls.rs +++ b/src/prolog/machine/system_calls.rs @@ -27,7 +27,7 @@ use std::collections::BTreeSet; use std::convert::TryFrom; use std::io::{ErrorKind, Read, Write}; use std::iter::{once, FromIterator}; -use std::fs::{File, OpenOptions}; +use std::fs::{OpenOptions}; use std::net::{TcpListener, TcpStream}; use std::ops::Sub; use std::rc::Rc; @@ -1820,109 +1820,6 @@ impl MachineState { } }; } - &SystemClauseType::FileToChars => { - // TODO: Replace this with stream. - let a1 = self.store(self.deref(self[temp_v!(1)])); - let a2 = self.store(self.deref(self[temp_v!(2)])); - - let file_name = match a1 { - Addr::Con(h) if self.heap.atom_at(h) => { - if let HeapCellValue::Atom(name, _) = &self.heap[h] { - name.clone() - } - else { - unreachable!() - } - } - Addr::Char(c) => { - clause_name!(c.to_string(), indices.atom_tbl.clone()) - } - _ => { - unreachable!() - } - }; - - let name = clause_name!("$file_to_chars"); - let mut file = match File::open(file_name.as_str()) { - Ok(f) => f, - Err(e) => { - let arity = 2; - let stub = MachineError::functor_stub(name.clone(), arity); - let h = self.heap.h(); - - let err = match e.kind() { - ErrorKind::NotFound => { - MachineError::existence_error( - h, - ExistenceError::ModuleSource( - ModuleSource::File(file_name) - ), - ) - } - ErrorKind::PermissionDenied => { - let source_sink = self.store(self.deref(a1)); - - MachineError::permission_error( - h, - Permission::Access, - "source_sink", - source_sink - ) - } - _ => unreachable!() // Not nice. - }; - - return Err(self.error_form(err, stub)); - } - }; - - - let type_str = match self.store(self.deref(self[temp_v!(3)])) { - Addr::Con(h) if self.heap.atom_at(h) => { - if let HeapCellValue::Atom(ref atom, _) = &self.heap[h] { - atom.as_str() - } else { - unreachable!() - } - } - _ => { - unreachable!() - } - }; - - let complete_string = { - let mut buffer = String::new(); - match type_str { - "text" => { match file.read_to_string(&mut buffer) { - Ok(_size) => { - self.heap.put_complete_string(&buffer) - } - Err(_e) => { - // the data isn't valid UTF-8, so we fail. - self.fail = true; - return Ok(()); - - } - } - } - "binary" => { let mut buffer = Vec::new(); - let _ = match file.read_to_end(&mut buffer) { - Ok(size) => size, - Err(_e) => unreachable!() - }; - - let buffer = String::from_iter( - buffer.into_iter().map(|b| b as char) - ); - - self.heap.put_complete_string(&buffer) - } - _ => { unreachable!() } - } - }; - - self.unify(complete_string, a2); - } &SystemClauseType::PutCode => { let mut stream = self.get_stream_or_alias(self[temp_v!(1)], indices, "put_code", 2)?; -- 2.54.0