Repositorios git - scryer-prolog.git/commitdiff
ENHANCED: phrase_from_file/[2,3] now read the file lazily.
author: Markus Triska <[email protected]>
Thu, 11 Jun 2020 13:29:06 +0000 (15:29 +0200)
committer: Markus Triska <[email protected]>
Thu, 11 Jun 2020 22:02:41 +0000 (00:02 +0200)
This allows processing extremely large files. In addition, the
efficient string representation helps to reduce memory usage.

These features unleash the full power of Prolog for text processing,
the exact use case Prolog was designed for.

src/prolog/clause_types.rs
src/prolog/lib/pio.pl
src/prolog/machine/system_calls.rs

index a62570824d6bcec311a6a703760b0a064d1b3b33..9b013dbc01ddb2ad67023655cabe9f05b7059b59 100644 (file)
@@ -180,7 +180,6 @@ pub enum SystemClauseType {
     ExpandTerm,
     FetchGlobalVar,
     FetchGlobalVarWithOffset,
-    FileToChars,
     FirstStream,
     FlushOutput,
     GetByte,
@@ -350,7 +349,6 @@ impl SystemClauseType {
             &SystemClauseType::FetchGlobalVarWithOffset => {
                 clause_name!("$fetch_global_var_with_offset")
             }
-            &SystemClauseType::FileToChars => clause_name!("$file_to_chars"),
             &SystemClauseType::FirstStream => clause_name!("$first_stream"),
             &SystemClauseType::FlushOutput => clause_name!("$flush_output"),
             &SystemClauseType::GetByte => clause_name!("$get_byte"),
@@ -549,7 +547,6 @@ impl SystemClauseType {
             ("$expand_goal", 2) => Some(SystemClauseType::ExpandGoal),
             ("$fetch_global_var", 2) => Some(SystemClauseType::FetchGlobalVar),
             ("$fetch_global_var_with_offset", 3) => Some(SystemClauseType::FetchGlobalVarWithOffset),
-            ("$file_to_chars", 3) => Some(SystemClauseType::FileToChars),
             ("$get_byte", 2) => Some(SystemClauseType::GetByte),
             ("$get_char", 2) => Some(SystemClauseType::GetChar),
             ("$get_code", 2) => Some(SystemClauseType::GetCode),
index 4c0f9046187692e5e0c7e1fad711225b5181d0b3..4132752b97f7fa76df47e3b670ec382883ec4e9d 100644 (file)
@@ -3,6 +3,8 @@
 
 :- use_module(library(dcgs)).
 :- use_module(library(error)).
+:- use_module(library(freeze)).
+:- use_module(library(iso_ext), [setup_call_cleanup/3, partial_string/3]).
 :- use_module(library(lists), [member/2]).
 
 phrase_from_file(NT, File) :-
@@ -19,6 +21,68 @@ phrase_from_file(NT, File, Options) :-
             member(Type, [text,binary])
         ;   Type = text
         ),
-        '$file_to_chars'(File, Chars, Type),
-        phrase(NT, Chars)
-    ).
+        setup_call_cleanup(open(File, read, Stream, [reposition(true)|Options]),
+                           (   stream_to_lazy_list(pio:Type, Stream, Xs),
+                               phrase(NT, Xs) ),
+                           close(Stream))
+   ).
+
+
+stream_to_lazy_list(Type_3, Stream, Xs) :-
+        stream_property(Stream, position(Pos)),
+        freeze(Xs, reader_step(Type_3, Stream, Pos, Xs)).
+
+reader_step(Type_3, Stream, Pos, Xs0) :-
+        set_stream_position(Stream, Pos),
+        (   at_end_of_stream(Stream)
+        ->  Xs0 = []
+        ;   % phrase(call(call(Type_3,Stream)), Xs0,Xs), % conforming call
+            call(Type_3, Stream, Cs,[]), % effective call
+            partial_string(Cs, Xs0, Xs),
+            stream_to_lazy_list(Type_3, Stream, Xs)
+        ).
+
+binary(Stream, Xs0, Xs) :- get_pending_bytes(Stream, Xs0, Xs).
+text(Stream, Xs0, Xs)   :- get_pending_chars(Stream, Xs0, Xs).
+
+
+get_pending_chars(Stream, Chs0,Chs) :-
+        n_get_chars(4096, Stream, Chs, Chs0,Chs).
+
+% EOF means: If EOF == [], then EOF has definitely been reached, otherwise
+% it is unknown and the argument remains uninstantiated.
+
+% To improve performance, the following predicates should be replaced
+% by a fast Rust implementation that reads a number of characters (or
+% bytes) at once.
+
+% Files that do not contain 0-bytes can even be mmapped to memory.
+
+n_get_chars(N0, Stream, EOF, Chs0,Chs) :-
+        N0 > 0,
+        N1 is N0-1,
+        get_char(Stream, Ch),
+        (   Ch == end_of_file
+        ->  Chs0 = Chs,
+            EOF = []
+        ;   Chs0 = [Ch|Chs1],
+            n_get_chars(N1, Stream, EOF, Chs1,Chs)
+        ).
+n_get_chars(0, _, _, Chs,Chs).
+
+
+get_pending_bytes(Stream, Chs0,Chs) :-
+        n_get_bytes(4096, Stream, Chs, Chs0,Chs).
+
+n_get_bytes(N0, Stream, EOF, Chs0,Chs) :-
+        N0 > 0,
+        N1 is N0-1,
+        get_byte(Stream, Byte),
+        (   Byte == -1
+        ->  Chs0 = Chs,
+            EOF = []
+        ;   char_code(Ch, Byte),
+            Chs0 = [Ch|Chs1],
+            n_get_bytes(N1, Stream, EOF, Chs1,Chs)
+        ).
+n_get_bytes(0, _, _, Chs,Chs).
index d4607c59e988fe04ed690233fde10654ba8ea16c..e6867c99d2433f7587b0403ff31d5029c38d1624 100644 (file)
@@ -27,7 +27,7 @@ use std::collections::BTreeSet;
 use std::convert::TryFrom;
 use std::io::{ErrorKind, Read, Write};
 use std::iter::{once, FromIterator};
-use std::fs::{File, OpenOptions};
+use std::fs::{OpenOptions};
 use std::net::{TcpListener, TcpStream};
 use std::ops::Sub;
 use std::rc::Rc;
@@ -1820,109 +1820,6 @@ impl MachineState {
                     }
                 };
             }
-            &SystemClauseType::FileToChars => {
-                // TODO: Replace this with stream.
-                let a1 = self.store(self.deref(self[temp_v!(1)]));
-                let a2 = self.store(self.deref(self[temp_v!(2)]));
-
-                let file_name = match a1 {
-                    Addr::Con(h) if self.heap.atom_at(h) => {
-                        if let HeapCellValue::Atom(name, _) = &self.heap[h] {
-                            name.clone()
-                        }
-                        else {
-                            unreachable!()
-                        }
-                    }
-                    Addr::Char(c) => {
-                        clause_name!(c.to_string(), indices.atom_tbl.clone())
-                    }
-                    _ => {
-                        unreachable!()
-                    }
-                };
-
-                let name = clause_name!("$file_to_chars");
-                let mut file = match File::open(file_name.as_str()) {
-                    Ok(f) => f,
-                    Err(e) => {
-                        let arity = 2;
-                        let stub = MachineError::functor_stub(name.clone(), arity);
-                        let h = self.heap.h();
-
-                        let err = match e.kind() {
-                            ErrorKind::NotFound => {
-                                MachineError::existence_error(
-                                    h,
-                                    ExistenceError::ModuleSource(
-                                        ModuleSource::File(file_name)
-                                    ),
-                                )
-                            }
-                            ErrorKind::PermissionDenied => {
-                                let source_sink = self.store(self.deref(a1));
-
-                                MachineError::permission_error(
-                                    h,
-                                    Permission::Access,
-                                    "source_sink",
-                                    source_sink
-                                )
-                            }
-                            _ => unreachable!()  // Not nice.
-                        };
-
-                        return Err(self.error_form(err, stub));
-                    }
-                };
-
-
-                let type_str = match self.store(self.deref(self[temp_v!(3)])) {
-                    Addr::Con(h) if self.heap.atom_at(h) => {
-                        if let HeapCellValue::Atom(ref atom, _) = &self.heap[h] {
-                            atom.as_str()
-                        } else {
-                            unreachable!()
-                        }
-                    }
-                    _ => {
-                        unreachable!()
-                    }
-                };
-
-                let complete_string = {
-                    let mut buffer = String::new();
-                    match type_str {
-                        "text" => {  match file.read_to_string(&mut buffer) {
-                                         Ok(_size) => {
-                                             self.heap.put_complete_string(&buffer)
-                                         }
-                                         Err(_e) => {
-                                            // the data isn't valid UTF-8, so we fail.
-                                           self.fail = true;
-                                           return Ok(());
-
-                                         }
-                                     }
-                                  }
-                        "binary" => { let mut buffer = Vec::new();
-                                      let _ = match file.read_to_end(&mut buffer) {
-                                          Ok(size) => size,
-                                          Err(_e) => unreachable!()
-                                      };
-
-                                      let buffer = String::from_iter(
-                                          buffer.into_iter().map(|b| b as char)
-                                      );
-
-                                      self.heap.put_complete_string(&buffer)
-                                    }
-                         _ => { unreachable!() }
-                         }
-                    };
-
-                self.unify(complete_string, a2);
-            }
             &SystemClauseType::PutCode => {
                 let mut stream =
                     self.get_stream_or_alias(self[temp_v!(1)], indices, "put_code", 2)?;