]> Repositorios git - scryer-prolog.git/commitdiff
ADDED: load_xml/3 to load XML files from streams, files and strings.
authorMarkus Triska <[email protected]>
Sun, 21 Jun 2020 06:35:08 +0000 (08:35 +0200)
committerMarkus Triska <[email protected]>
Sun, 21 Jun 2020 07:45:37 +0000 (09:45 +0200)
Cargo.toml
README.md
src/clause_types.rs
src/lib/sgml.pl
src/machine/system_calls.rs

index 2085914f312dcff6b9b21f5480776b45b6bda037..bc13c81f179acaf4b8f409e98a8a6a8c07aa40ed 100644 (file)
@@ -45,3 +45,4 @@ openssl = { version = "0.10.29", features = ["vendored"] }
 native-tls = "0.2.4"
 chrono = "0.4.11"
 select = "0.4.3"
+roxmltree = "0.11.0"
index 9d5faaafaac697b65f45d4affbd2085638fe4507..d8c4acd84caa506b29ceb93daccde5a22a859c2a 100644 (file)
--- a/README.md
+++ b/README.md
@@ -425,8 +425,9 @@ The modules that ship with Scryer&nbsp;Prolog are also called
 * [`http/http_open`](src/lib/http/http_open.pl) Open a stream to
   read answers from web&nbsp;servers. HTTPS is also supported.
 * [`sgml`](src/lib/sgml.pl)
-  `load_html/3` represents HTML&nbsp;documents as Prolog&nbsp;terms
-  for convenient and efficient reasoning. See also `library(xpath)`.
+  `load_html/3` and `load_xml/3` represent HTML and XML&nbsp;documents
+  as Prolog&nbsp;terms for convenient and efficient reasoning. Use
+  `library(xpath)` to extract information from parsed documents.
 * [`xpath`](src/lib/xpath.pl)
   The predicate `xpath/3` is used for convenient reasoning about
   HTML and XML&nbsp;documents, inspired by the XPath language. This
index 5b4e56ffd26d113bcf32a93f7361379bdeb4dbb6..7b7923c75a059f9ad313b71f207465fa7ea6942c 100644 (file)
@@ -299,7 +299,8 @@ pub enum SystemClauseType {
     Ed25519Verify,
     Ed25519NewKeyPair,
     Ed25519KeyPairPublicKey,
-    LoadHTML
+    LoadHTML,
+    LoadXML,
 }
 
 impl SystemClauseType {
@@ -498,6 +499,7 @@ impl SystemClauseType {
             &SystemClauseType::Ed25519NewKeyPair => clause_name!("$ed25519_new_keypair"),
             &SystemClauseType::Ed25519KeyPairPublicKey => clause_name!("$ed25519_keypair_public_key"),
             &SystemClauseType::LoadHTML => clause_name!("$load_html"),
+            &SystemClauseType::LoadXML => clause_name!("$load_xml"),
         }
     }
 
@@ -676,6 +678,7 @@ impl SystemClauseType {
             ("$ed25519_new_keypair", 1) => Some(SystemClauseType::Ed25519NewKeyPair),
             ("$ed25519_keypair_public_key", 2) => Some(SystemClauseType::Ed25519KeyPairPublicKey),
             ("$load_html", 3) => Some(SystemClauseType::LoadHTML),
+            ("$load_xml", 3) => Some(SystemClauseType::LoadXML),
             _ => None,
         }
     }
index 568259209e811f9e1c5b2857a7a6725dcf1ae393..460646b791ea92fcc1e2d844aa857ddad52f05be 100644 (file)
@@ -1,18 +1,23 @@
 /* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-   Predicates for parsing markup documents.
+   Predicates for parsing HTML and XML documents.
    Written June 2020 by Markus Triska ([email protected])
    Part of Scryer Prolog.
 
-   Currently, only a single predicate is provided:
+   Currently, two predicates are provided:
 
-   load_html(+In, -Es, +Options)
-   =============================
+   -  load_html(+Source, -Es, +Options)
+   -  load_xml(+Source, -Es, +Options)
 
-   In must be a stream, specified as stream(S), and Es is unified with
-   a list of elements of the form:
+   These predicates parse HTML and XML documents, respectively.
 
-   * a list of characters, representing text
+   Source must be one of: a stream, specified as stream(S); a file,
+   specified as file(Name), where Name is a list of characters; or a
+   list of characters that holds the document contents itself.
+
+   Es is unified with the abstract syntax tree of the parsed document,
+   represented as a list of elements where each is of the form:
 
+   * a list of characters, representing text
    * element(Name, Attrs, Children)
      - Name is the name of the tag
      - Attrs is a list of Key=Value pairs:
    Currently, Options are ignored. In the future, more options may be
    provided to control parsing.
 
-   Use http_open/3 from library(http/http_open) to read answers from
-   web servers via streams.
+   Example:
+
+      ?- load_html("<html><head><title>Hello!</title></head></html>", Es, []).
+
+   Yielding:
+
+         Es = [element(html,[],
+                [element(head,[],
+                  [element(title,[],
+                    ["Hello!"])]),
+                 element(body,[],[])])].
 
    library(xpath) provides convenient reasoning about parsed documents.
+   For example, to fetch the title of the document above, we can use:
+
+      ?- load_html("<html><head><title>Hello!</title></head></html>", Es, []),
+         xpath(Es, //title(text), T).
+
+   Yielding T = "Hello!".
+
+   Use http_open/3 from library(http/http_open) to read answers from
+   web servers via streams.
 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 
-:- module(sgml, [load_html/3]).
+:- module(sgml, [load_html/3,
+                 load_xml/3]).
 
 :- use_module(library(iso_ext)).
 :- use_module(library(error)).
+:- use_module(library(dcgs)).
+:- use_module(library(pio)).
 
-load_html(stream(Stream), [E], Options) :-
+load_html(Source, Es, Options) :- % parse Source as HTML; source handling is shared via load_structure_/4
+        load_structure_(Source, Es, Options, html).
+load_xml(Source, Es, Options) :- % parse Source as XML; same source handling, different parser tag
+        load_structure_(Source, Es, Options, xml).
+
+list([]) --> []. % list//1 describes a sequence of elements; used below with phrase_from_file/2
+list([L|Ls]) --> [L], list(Ls).
+
+load_structure_([], [], _, _). % empty character list: no elements; Options is not inspected in this clause
+load_structure_([C|Cs], [E], Options, What) :- % list with at least one character: the document text itself
+        load_(What, [C|Cs], E, Options).
+load_structure_(file(Fs), [E], Options, What) :- % file(Name), with Name given as a list of characters
+        must_be(list, Options),
+        must_be(list, Fs),
+        atom_chars(File, Fs), % phrase_from_file/2 is called with the file name as an atom
+        once(phrase_from_file(list(Cs), File)), % once/1 commits to the first solution of list//1
+        load_(What, Cs, E, Options).
+load_structure_(stream(Stream), [E], Options, What) :- % stream(S): read all characters, then parse
         must_be(list, Options),
         read_to_end(Stream, Cs),
-        '$load_html'(Cs, E, []).
+        load_(What, Cs, E, Options).
+
+load_(html, Cs, E, Options) :- '$load_html'(Cs, E, Options). % first argument selects the builtin parser
+load_(xml, Cs, E, Options) :- '$load_xml'(Cs, E, Options).
 
 read_to_end(Stream, Cs) :-
         '$get_n_chars'(Stream, 4096, Cs0),
index 75fd9a317a05e53cdf7b3364b114b04b8e107980..97c1686749a3afcae0bb8c03c668306a71037a44 100644 (file)
@@ -53,6 +53,7 @@ use crate::openssl::nid::Nid;
 use crate::native_tls::TlsConnector;
 
 extern crate select;
+use roxmltree;
 
 pub fn get_key() -> KeyEvent {
     let key;
@@ -5568,11 +5569,71 @@ impl MachineState {
 
                 self.unify(self[temp_v!(2)], result);
             }
+            &SystemClauseType::LoadXML => { // backs '$load_xml'/3; the third argument is not read in this arm
+                let string = self.heap_pstr_iter(self[temp_v!(1)]).to_string(); // document text; presumably temp_v!(1) is the first argument register
+                match roxmltree::Document::parse(&string) {
+                    Ok(doc) => { let result = self.xml_node_to_term(indices, doc.root_element()); // only the root element is converted
+                                 self.unify(self[temp_v!(2)], result);
+                    }
+                    _ => { self.fail = true; // parse error: the call fails, no error term is raised
+                           return Ok(()); // returns before the return_from_clause! after the match
+                    }
+                }
+            }
         };
 
         return_from_clause!(self.last_call, self)
     }
 
+    pub(super)
+    fn xml_node_to_term(
+        &mut self,
+        indices: &mut IndexStore,
+        node: roxmltree::Node,
+    ) -> Addr {
+        if node.has_children() {
+            let mut avec = Vec::new();
+            for attr in node.attributes() {
+                let chars = clause_name!(String::from(attr.name()), indices.atom_tbl);
+                let name  = self.heap.to_unifiable(
+                    HeapCellValue::Atom(chars, None)
+                );
+
+                let value = self.heap.put_complete_string(&attr.value());
+
+                avec.push(HeapCellValue::Addr(Addr::HeapCell(self.heap.h())));
+
+                self.heap.push(HeapCellValue::NamedStr(2, clause_name!("="), None));
+                self.heap.push(HeapCellValue::Addr(name));
+                self.heap.push(HeapCellValue::Addr(value));
+            }
+            let attrs = Addr::HeapCell(self.heap.to_list(avec.into_iter()));
+
+            let mut cvec = Vec::new();
+            for child in node.children() {
+                cvec.push(self.xml_node_to_term(indices, child));
+            }
+            let children = Addr::HeapCell(self.heap.to_list(cvec.into_iter()));
+
+            let chars = clause_name!(String::from(node.tag_name().name()), indices.atom_tbl);
+            let tag  = self.heap.to_unifiable(
+                HeapCellValue::Atom(chars, None)
+            );
+
+            let result = Addr::HeapCell(self.heap.h());
+
+            self.heap.push(HeapCellValue::NamedStr(3, clause_name!("element"), None));
+            self.heap.push(HeapCellValue::Addr(tag));
+            self.heap.push(HeapCellValue::Addr(attrs));
+            self.heap.push(HeapCellValue::Addr(children));
+
+            result
+        } else {
+            let string = String::from(node.text().unwrap());
+            self.heap.put_complete_string(&string)
+        }
+    }
+
     pub(super)
     fn html_node_to_term(
         &mut self,