/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- Predicates for parsing markup documents.
+ Predicates for parsing HTML and XML documents.
Part of Scryer Prolog.
- Currently, only a single predicate is provided:
+ Currently, two predicates are provided:
- load_html(+In, -Es, +Options)
- =============================
+ - load_html(+Source, -Es, +Options)
+ - load_xml(+Source, -Es, +Options)
- In must be a stream, specified as stream(S), and Es is unified with
- a list of elements of the form:
+ These predicates parse HTML and XML documents, respectively.
- * a list of characters, representing text
+ Source must be one of the following: a stream, specified as
+ stream(S); a file, specified as file(Name), where Name is a list
+ of characters; or a list of characters holding the document contents.
+
+ Es is unified with the abstract syntax tree of the parsed document,
+ represented as a list of elements where each is of the form:
+ * a list of characters, representing text
* element(Name, Attrs, Children)
- Name is the name of the tag
- Attrs is a list of Key=Value pairs:
Currently, Options are ignored. In the future, more options may be
provided to control parsing.
- Use http_open/3 from library(http/http_open) to read answers from
- web servers via streams.
+ Example:
+
+ ?- load_html("<html><head><title>Hello!</title></head></html>", Es, []).
+
+ Yielding:
+
+ Es = [element(html,[],
+ [element(head,[],
+ [element(title,[],
+ ["Hello!"])]),
+ element(body,[],[])])].
library(xpath) provides convenient reasoning about parsed documents.
+ For example, to fetch the title of the document above, we can use:
+
+ ?- load_html("<html><head><title>Hello!</title></head></html>", Es, []),
+ xpath(Es, //title(text), T).
+
+ Yielding T = "Hello!".
+
+ Use http_open/3 from library(http/http_open) to read answers from
+ web servers via streams.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
-:- module(sgml, [load_html/3]).
+:- module(sgml, [load_html/3,
+ load_xml/3]).
:- use_module(library(iso_ext)).
:- use_module(library(error)).
+:- use_module(library(dcgs)).
+:- use_module(library(pio)).
-load_html(stream(Stream), [E], Options) :-
+% load_html(+Source, -Es, +Options)
+% Parses Source as HTML by delegating to load_structure_/4 with the
+% format tag html.
+load_html(Source, Es, Options) :-
+ load_structure_(Source, Es, Options, html).
+% load_xml(+Source, -Es, +Options)
+% Parses Source as XML by delegating to load_structure_/4 with the
+% format tag xml.
+load_xml(Source, Es, Options) :-
+ load_structure_(Source, Es, Options, xml).
+
+% list(Ls)//0: DCG nonterminal describing exactly the sequence of
+% terminals Ls. It is used with phrase_from_file/2 below to relate Cs
+% to the full contents of a file.
+list([]) --> [].
+list([L|Ls]) --> [L], list(Ls).
+
+% load_structure_(+Source, -Es, +Options, +What)
+% Common implementation of load_html/3 and load_xml/3. What is the
+% format tag (html or xml) that is handed on to load_/4.
+%
+% An empty list of characters yields an empty list of elements.
+load_structure_([], [], _, _).
+% A non-empty list is passed to the parser as the document contents; the
+% result is wrapped in a one-element list.
+% NOTE(review): unlike the file(_) and stream(_) clauses, this clause does
+% not call must_be(list, Options) -- confirm whether that is intended.
+load_structure_([C|Cs], [E], Options, What) :-
+ load_(What, [C|Cs], E, Options).
+% file(Fs): Fs is a list of characters naming the file. It is converted to
+% an atom, the file contents are read into Cs via list//1, and once/1
+% commits to the first solution of phrase_from_file/2.
+load_structure_(file(Fs), [E], Options, What) :-
+ must_be(list, Options),
+ must_be(list, Fs),
+ atom_chars(File, Fs),
+ once(phrase_from_file(list(Cs), File)),
+ load_(What, Cs, E, Options).
+% stream(Stream): the stream is read with read_to_end/2 and the resulting
+% characters are parsed.
+load_structure_(stream(Stream), [E], Options, What) :-
must_be(list, Options),
read_to_end(Stream, Cs),
- '$load_html'(Cs, E, []).
+ load_(What, Cs, E, Options).
+
+% load_(+What, +Cs, -E, +Options): dispatches on the format tag What to
+% the corresponding '$'-prefixed predicate, '$load_html'/3 for html and
+% '$load_xml'/3 for xml.
+load_(html, Cs, E, Options) :- '$load_html'(Cs, E, Options).
+load_(xml, Cs, E, Options) :- '$load_xml'(Cs, E, Options).
read_to_end(Stream, Cs) :-
'$get_n_chars'(Stream, 4096, Cs0),
use crate::native_tls::TlsConnector;
extern crate select;
+use roxmltree;
pub fn get_key() -> KeyEvent {
let key;
self.unify(self[temp_v!(2)], result);
}
+ &SystemClauseType::LoadXML => {
+ let string = self.heap_pstr_iter(self[temp_v!(1)]).to_string();
+ match roxmltree::Document::parse(&string) {
+ Ok(doc) => { let result = self.xml_node_to_term(indices, doc.root_element());
+ self.unify(self[temp_v!(2)], result);
+ }
+ _ => { self.fail = true;
+ return Ok(());
+ }
+ }
+ }
};
return_from_clause!(self.last_call, self)
}
+    /// Converts a `roxmltree` node into a term on the heap and returns its address.
+    ///
+    /// Element nodes become `element(Name, Attrs, Children)`: `Name` is an atom
+    /// holding the tag name, `Attrs` is a list of `Key=Value` terms (atom key,
+    /// value built with `put_complete_string`) and `Children` is the list of the
+    /// recursively converted child nodes. Any other node is converted to a string
+    /// built from `node.text()`, or from the empty string if the node has no text.
+    pub(super)
+    fn xml_node_to_term(
+        &mut self,
+        indices: &mut IndexStore,
+        node: roxmltree::Node,
+    ) -> Addr {
+        // Dispatch on the node kind, not on `has_children()`: an empty element
+        // such as `<br/>` or `<img src="x"/>` has neither children nor text, so
+        // treating it as a text node made `node.text().unwrap()` panic and
+        // would have dropped its attributes.
+        if node.is_element() {
+            let mut avec = Vec::new();
+            for attr in node.attributes() {
+                let chars = clause_name!(String::from(attr.name()), indices.atom_tbl);
+                let name = self.heap.to_unifiable(
+                    HeapCellValue::Atom(chars, None)
+                );
+
+                let value = self.heap.put_complete_string(&attr.value());
+
+                // Record where the `=`/2 structure pushed just below starts.
+                avec.push(HeapCellValue::Addr(Addr::HeapCell(self.heap.h())));
+
+                self.heap.push(HeapCellValue::NamedStr(2, clause_name!("="), None));
+                self.heap.push(HeapCellValue::Addr(name));
+                self.heap.push(HeapCellValue::Addr(value));
+            }
+            let attrs = Addr::HeapCell(self.heap.to_list(avec.into_iter()));
+
+            let mut cvec = Vec::new();
+            for child in node.children() {
+                cvec.push(self.xml_node_to_term(indices, child));
+            }
+            let children = Addr::HeapCell(self.heap.to_list(cvec.into_iter()));
+
+            let chars = clause_name!(String::from(node.tag_name().name()), indices.atom_tbl);
+            let tag = self.heap.to_unifiable(
+                HeapCellValue::Atom(chars, None)
+            );
+
+            // Address of the `element`/3 structure pushed just below.
+            let result = Addr::HeapCell(self.heap.h());
+
+            self.heap.push(HeapCellValue::NamedStr(3, clause_name!("element"), None));
+            self.heap.push(HeapCellValue::Addr(tag));
+            self.heap.push(HeapCellValue::Addr(attrs));
+            self.heap.push(HeapCellValue::Addr(children));
+
+            result
+        } else {
+            // `text()` returns `None` for nodes that carry no text (for example
+            // processing instructions); use an empty string instead of panicking.
+            let string = String::from(node.text().unwrap_or(""));
+            self.heap.put_complete_string(&string)
+        }
+    }
+
pub(super)
fn html_node_to_term(
&mut self,