From 79829cb6d2d9bec25124864df7ff31779dcb7d8e Mon Sep 17 00:00:00 2001 From: Markus Triska Date: Sun, 14 Jun 2020 16:52:46 +0200 Subject: [PATCH] ADDED: library(sgml), providing load_html/3 to parse HTML documents --- Cargo.toml | 1 + README.md | 3 ++ src/clause_types.rs | 7 +++-- src/lib/sgml.pl | 41 +++++++++++++++++++++++++ src/machine/system_calls.rs | 60 +++++++++++++++++++++++++++++++++++++ 5 files changed, 110 insertions(+), 2 deletions(-) create mode 100644 src/lib/sgml.pl diff --git a/Cargo.toml b/Cargo.toml index a7cc588b..2085914f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,3 +44,4 @@ blake2 = "0.8.1" openssl = { version = "0.10.29", features = ["vendored"] } native-tls = "0.2.4" chrono = "0.4.11" +select = "0.4.3" diff --git a/README.md b/README.md index c7e6281f..3f3faca2 100644 --- a/README.md +++ b/README.md @@ -422,6 +422,9 @@ The modules that ship with Scryer Prolog are also called Probabilistic predicates and random number generators. * [`http/http_open`](src/lib/http/http_open.pl) Open a stream to read answers from web servers. HTTPS is also supported. +* [`sgml`](src/lib/sgml.pl) + `load_html/3` represents HTML documents as Prolog terms + for convenient and efficient reasoning. * [`sockets`](src/lib/sockets.pl) Predicates for opening and accepting TCP connections as streams. 
TLS negotiation is performed via the option `tls(true)` in diff --git a/src/clause_types.rs b/src/clause_types.rs index 18534668..5b4e56ff 100644 --- a/src/clause_types.rs +++ b/src/clause_types.rs @@ -298,7 +298,8 @@ pub enum SystemClauseType { Ed25519Sign, Ed25519Verify, Ed25519NewKeyPair, - Ed25519KeyPairPublicKey + Ed25519KeyPairPublicKey, + LoadHTML } impl SystemClauseType { @@ -495,7 +496,8 @@ impl SystemClauseType { &SystemClauseType::Ed25519Sign => clause_name!("$ed25519_sign"), &SystemClauseType::Ed25519Verify => clause_name!("$ed25519_verify"), &SystemClauseType::Ed25519NewKeyPair => clause_name!("$ed25519_new_keypair"), - &SystemClauseType::Ed25519KeyPairPublicKey => clause_name!("$ed25519_keypair_public_key") + &SystemClauseType::Ed25519KeyPairPublicKey => clause_name!("$ed25519_keypair_public_key"), + &SystemClauseType::LoadHTML => clause_name!("$load_html"), } } @@ -673,6 +675,7 @@ impl SystemClauseType { ("$ed25519_verify", 3) => Some(SystemClauseType::Ed25519Verify), ("$ed25519_new_keypair", 1) => Some(SystemClauseType::Ed25519NewKeyPair), ("$ed25519_keypair_public_key", 2) => Some(SystemClauseType::Ed25519KeyPairPublicKey), + ("$load_html", 3) => Some(SystemClauseType::LoadHTML), _ => None, } } diff --git a/src/lib/sgml.pl b/src/lib/sgml.pl new file mode 100644 index 00000000..2a15e425 --- /dev/null +++ b/src/lib/sgml.pl @@ -0,0 +1,41 @@ +/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + Predicates for parsing markup documents. + Written June 2020 by Markus Triska (triska@metalevel.at) + Part of Scryer Prolog. 
+
+   Currently, only a single predicate is provided:
+
+   load_html(+In, -Es, +Options)
+   =============================
+
+   In must be a stream, specified as stream(S), and Es is unified with
+   a list of elements of the form:
+
+   * a list of characters, representing text
+
+   * element(Name, Attrs, Children)
+     - Name is the name of the tag
+     - Attrs is a list of Key=Value pairs:
+       Key is an atom, and Value is a list of characters
+     - Children is a list of elements as specified here.
+
+   Currently, Options are ignored. In the future, more options may be
+   provided to control parsing.
+- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
+
+:- module(sgml, [load_html/3]).
+
+:- use_module(library(iso_ext)).
+:- use_module(library(error)).
+
+load_html(stream(Stream), [E], Options) :-      % Es is always a one-element list [E]
+        must_be(list, Options),                 % Options must be a list; it is not passed on
+        read_to_end(Stream, Cs),                % Cs: everything read from Stream
+        '$load_html'(Cs, E, []).                % third argument is always []
+
+read_to_end(Stream, Cs) :-                      % Cs: chunks read from Stream, in order
+        '$get_n_chars'(Stream, 4096, Cs0),      % presumably reads at most 4096 chars - confirm
+        (   Cs0 = [] -> Cs = []                 % empty chunk: stop reading
+        ;   partial_string(Cs0, Cs, Rest),      % presumably Cs is Cs0 with tail Rest - confirm
+            read_to_end(Stream, Rest)           % later chunks are read into Rest
+        ).
diff --git a/src/machine/system_calls.rs b/src/machine/system_calls.rs
index a3943b8b..75fd9a31 100644
--- a/src/machine/system_calls.rs
+++ b/src/machine/system_calls.rs
@@ -52,6 +52,8 @@ use crate::openssl::nid::Nid;
 
 use crate::native_tls::TlsConnector;
 
+extern crate select;
+
 pub fn get_key() -> KeyEvent {
     let key;
     enable_raw_mode().expect("failed to enable raw mode");
@@ -5559,8 +5561,92 @@ impl MachineState {
                     _ => { self.fail = true; return Ok(()); }
                 }
             }
+            &SystemClauseType::LoadHTML => {
+                let string = self.heap_pstr_iter(self[temp_v!(1)]).to_string();
+
+                // Fail the goal instead of panicking when the document
+                // cannot be read or contains no node at all.
+                let doc = match select::document::Document::from_read(string.as_bytes()) {
+                    Ok(doc) => doc,
+                    Err(_) => { self.fail = true; return Ok(()); }
+                };
+
+                let result = match doc.nth(0) {
+                    Some(root) => self.html_node_to_term(indices, root),
+                    None => { self.fail = true; return Ok(()); }
+                };
+
+                self.unify(self[temp_v!(2)], result);
+            }
         };
 
         return_from_clause!(self.last_call, self)
     }
+
+    /// Converts `node`, together with all of its descendants, into a term
+    /// on the heap and returns the address of that term.
+    ///
+    /// * A node for which `node.name()` is `None` becomes a complete string
+    ///   holding `node.text()`.
+    /// * Any other node becomes `element(Name, Attrs, Children)`: `Name` is
+    ///   the tag name as an atom, `Attrs` is a list of `Key=Value` terms
+    ///   (atom key, string value) and `Children` is the list of the
+    ///   converted child nodes.
+    ///
+    /// Atoms for tag and attribute names are built with `clause_name!` and
+    /// `indices.atom_tbl`. Each child is converted by a recursive call.
+    pub(super)
+    fn html_node_to_term(
+        &mut self,
+        indices: &mut IndexStore,
+        node: select::node::Node,
+    ) -> Addr {
+        match node.name() {
+            None => {
+                let text = node.text();
+                self.heap.put_complete_string(&text)
+            }
+            Some(name) => {
+                let mut avec = Vec::new();
+                for attr in node.attrs() {
+                    let chars = clause_name!(String::from(attr.0), indices.atom_tbl);
+                    let key = self.heap.to_unifiable(
+                        HeapCellValue::Atom(chars, None)
+                    );
+
+                    let value = self.heap.put_complete_string(attr.1);
+
+                    // Record `self.heap.h()` before the `=`/2 cells are
+                    // pushed, so the list entry refers to that structure.
+                    avec.push(HeapCellValue::Addr(Addr::HeapCell(self.heap.h())));
+
+                    self.heap.push(HeapCellValue::NamedStr(2, clause_name!("="), None));
+                    self.heap.push(HeapCellValue::Addr(key));
+                    self.heap.push(HeapCellValue::Addr(value));
+                }
+                let attrs = Addr::HeapCell(self.heap.to_list(avec.into_iter()));
+
+                let mut cvec = Vec::new();
+                for child in node.children() {
+                    cvec.push(self.html_node_to_term(indices, child));
+                }
+                let children = Addr::HeapCell(self.heap.to_list(cvec.into_iter()));
+
+                let chars = clause_name!(String::from(name), indices.atom_tbl);
+                let tag = self.heap.to_unifiable(
+                    HeapCellValue::Atom(chars, None)
+                );
+
+                let result = Addr::HeapCell(self.heap.h());
+
+                self.heap.push(HeapCellValue::NamedStr(3, clause_name!("element"), None));
+                self.heap.push(HeapCellValue::Addr(tag));
+                self.heap.push(HeapCellValue::Addr(attrs));
+                self.heap.push(HeapCellValue::Addr(children));
+
+                result
+            }
+        }
+    }
 }
-- 
2.54.0