From 32eb896c564df9d09dba52c7b7374dd33229258c Mon Sep 17 00:00:00 2001 From: Markus Triska Date: Sun, 21 Jun 2020 08:35:08 +0200 Subject: [PATCH] ADDED: load_xml/3 to load XML files from streams, files and strings. --- Cargo.toml | 1 + README.md | 5 +-- src/clause_types.rs | 5 ++- src/lib/sgml.pl | 70 ++++++++++++++++++++++++++++++------- src/machine/system_calls.rs | 61 ++++++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 15 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2085914f..bc13c81f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,3 +45,4 @@ openssl = { version = "0.10.29", features = ["vendored"] } native-tls = "0.2.4" chrono = "0.4.11" select = "0.4.3" +roxmltree = "0.11.0" diff --git a/README.md b/README.md index 9d5faaaf..d8c4acd8 100644 --- a/README.md +++ b/README.md @@ -425,8 +425,9 @@ The modules that ship with Scryer Prolog are also called * [`http/http_open`](src/lib/http/http_open.pl) Open a stream to read answers from web servers. HTTPS is also supported. * [`sgml`](src/lib/sgml.pl) - `load_html/3` represents HTML documents as Prolog terms - for convenient and efficient reasoning. See also `library(xpath)`. + `load_html/3` and `load_xml/3` represent HTML and XML documents + as Prolog terms for convenient and efficient reasoning. Use + `library(xpath)` to extract information from parsed documents. * [`xpath`](src/lib/xpath.pl) The predicate `xpath/3` is used for convenient reasoning about HTML and XML documents, inspired by the XPath language. 
This diff --git a/src/clause_types.rs b/src/clause_types.rs index 5b4e56ff..7b7923c7 100644 --- a/src/clause_types.rs +++ b/src/clause_types.rs @@ -299,7 +299,8 @@ pub enum SystemClauseType { Ed25519Verify, Ed25519NewKeyPair, Ed25519KeyPairPublicKey, - LoadHTML + LoadHTML, + LoadXML, } impl SystemClauseType { @@ -498,6 +499,7 @@ impl SystemClauseType { &SystemClauseType::Ed25519NewKeyPair => clause_name!("$ed25519_new_keypair"), &SystemClauseType::Ed25519KeyPairPublicKey => clause_name!("$ed25519_keypair_public_key"), &SystemClauseType::LoadHTML => clause_name!("$load_html"), + &SystemClauseType::LoadXML => clause_name!("$load_xml"), } } @@ -676,6 +678,7 @@ impl SystemClauseType { ("$ed25519_new_keypair", 1) => Some(SystemClauseType::Ed25519NewKeyPair), ("$ed25519_keypair_public_key", 2) => Some(SystemClauseType::Ed25519KeyPairPublicKey), ("$load_html", 3) => Some(SystemClauseType::LoadHTML), + ("$load_xml", 3) => Some(SystemClauseType::LoadXML), _ => None, } } diff --git a/src/lib/sgml.pl b/src/lib/sgml.pl index 56825920..460646b7 100644 --- a/src/lib/sgml.pl +++ b/src/lib/sgml.pl @@ -1,18 +1,23 @@ /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Predicates for parsing markup documents. + Predicates for parsing HTML and XML documents. Written June 2020 by Markus Triska (triska@metalevel.at) Part of Scryer Prolog. - Currently, only a single predicate is provided: + Currently, two predicates are provided: - load_html(+In, -Es, +Options) - ============================= + - load_html(+Source, -Es, +Options) + - load_xml(+Source, -Es, +Options) - In must be a stream, specified as stream(S), and Es is unified with - a list of elements of the form: + These predicates parse HTML and XML documents, respectively. - * a list of characters, representing text + Source must be a stream, specified as stream(S), or a file, + specified as file(Name), where Name is a list of characters, or a + list of characters with the document contents. 
+ + Es is unified with the abstract syntax tree of the parsed document, + represented as a list of elements where each is of the form: + * a list of characters, representing text * element(Name, Attrs, Children) - Name is the name of the tag - Attrs is a list of Key=Value pairs: @@ -22,21 +27,62 @@ Currently, Options are ignored. In the future, more options may be provided to control parsing. - Use http_open/3 from library(http/http_open) to read answers from - web servers via streams. + Example: + + ?- load_html("<html><head><title>Hello!</title></head></html>", Es, []). + + Yielding: + + Es = [element(html,[], + [element(head,[], + [element(title,[], + ["Hello!"])]), + element(body,[],[])])]. library(xpath) provides convenient reasoning about parsed documents. + For example, to fetch the title of the document above, we can use: + + ?- load_html("<html><head><title>Hello!</title></head></html>", Es, []), + xpath(Es, //title(text), T). + + Yielding T = "Hello!". + + Use http_open/3 from library(http/http_open) to read answers from + web servers via streams. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */ -:- module(sgml, [load_html/3]). +:- module(sgml, [load_html/3, + load_xml/3]). :- use_module(library(iso_ext)). :- use_module(library(error)). +:- use_module(library(dcgs)). +:- use_module(library(pio)). -load_html(stream(Stream), [E], Options) :- +load_html(Source, Es, Options) :- + load_structure_(Source, Es, Options, html). +load_xml(Source, Es, Options) :- + load_structure_(Source, Es, Options, xml). + +list([]) --> []. +list([L|Ls]) --> [L], list(Ls). + +load_structure_([], [], _, _). +load_structure_([C|Cs], [E], Options, What) :- + load_(What, [C|Cs], E, Options). +load_structure_(file(Fs), [E], Options, What) :- + must_be(list, Options), + must_be(list, Fs), + atom_chars(File, Fs), + once(phrase_from_file(list(Cs), File)), + load_(What, Cs, E, Options). +load_structure_(stream(Stream), [E], Options, What) :- must_be(list, Options), read_to_end(Stream, Cs), - '$load_html'(Cs, E, []). 
+        load_(What, Cs, E, Options).
+
+load_(html, Cs, E, Options) :- '$load_html'(Cs, E, Options).
+load_(xml, Cs, E, Options) :- '$load_xml'(Cs, E, Options).
 
 read_to_end(Stream, Cs) :-
         '$get_n_chars'(Stream, 4096, Cs0),
diff --git a/src/machine/system_calls.rs b/src/machine/system_calls.rs
index 75fd9a31..97c16867 100644
--- a/src/machine/system_calls.rs
+++ b/src/machine/system_calls.rs
@@ -53,6 +53,7 @@ use crate::openssl::nid::Nid;
 use crate::native_tls::TlsConnector;
 
 extern crate select;
+use roxmltree;
 
 pub fn get_key() -> KeyEvent {
     let key;
@@ -5568,11 +5569,82 @@ impl MachineState {
 
                 self.unify(self[temp_v!(2)], result);
             }
+            &SystemClauseType::LoadXML => {
+                let string = self.heap_pstr_iter(self[temp_v!(1)]).to_string();
+                match roxmltree::Document::parse(&string) {
+                    Ok(doc) => { let result = self.xml_node_to_term(indices, doc.root_element());
+                                 self.unify(self[temp_v!(2)], result);
+                    }
+                    _ => { self.fail = true;
+                           return Ok(());
+                    }
+                }
+            }
         };
 
         return_from_clause!(self.last_call, self)
     }
 
+    /// Converts a roxmltree node into a Prolog term built on the heap.
+    ///
+    /// Element nodes become `element(Name, Attrs, Children)`: `Name` is the
+    /// tag name as an atom, `Attrs` a list of `Key=Value` pairs, and
+    /// `Children` the list of recursively converted child nodes. Any other
+    /// node becomes a string holding its text.
+    pub(super)
+    fn xml_node_to_term(
+        &mut self,
+        indices: &mut IndexStore,
+        node: roxmltree::Node,
+    ) -> Addr {
+        // Branch on the node kind rather than on `has_children()`: an empty
+        // element such as `<br/>` has neither children nor text, so treating
+        // it as text would panic and drop its name and attributes.
+        if node.is_element() {
+            let mut avec = Vec::new();
+            for attr in node.attributes() {
+                let chars = clause_name!(String::from(attr.name()), indices.atom_tbl);
+                let name = self.heap.to_unifiable(
+                    HeapCellValue::Atom(chars, None)
+                );
+
+                let value = self.heap.put_complete_string(&attr.value());
+
+                avec.push(HeapCellValue::Addr(Addr::HeapCell(self.heap.h())));
+
+                self.heap.push(HeapCellValue::NamedStr(2, clause_name!("="), None));
+                self.heap.push(HeapCellValue::Addr(name));
+                self.heap.push(HeapCellValue::Addr(value));
+            }
+            let attrs = Addr::HeapCell(self.heap.to_list(avec.into_iter()));
+
+            let mut cvec = Vec::new();
+            for child in node.children() {
+                cvec.push(self.xml_node_to_term(indices, child));
+            }
+            let children = Addr::HeapCell(self.heap.to_list(cvec.into_iter()));
+
+            let chars = clause_name!(String::from(node.tag_name().name()), indices.atom_tbl);
+            let tag = self.heap.to_unifiable(
+                HeapCellValue::Atom(chars, None)
+            );
+
+            let result = Addr::HeapCell(self.heap.h());
+
+            self.heap.push(HeapCellValue::NamedStr(3, clause_name!("element"), None));
+            self.heap.push(HeapCellValue::Addr(tag));
+            self.heap.push(HeapCellValue::Addr(attrs));
+            self.heap.push(HeapCellValue::Addr(children));
+
+            result
+        } else {
+            // `text()` is `None` for nodes that carry no text; emit an empty
+            // string for them instead of panicking.
+            let string = String::from(node.text().unwrap_or(""));
+            self.heap.put_complete_string(&string)
+        }
+    }
+
     pub(super)
     fn html_node_to_term(
         &mut self,
-- 
2.54.0