From 725def07cdc5f597cdc9901550683bc4e1094a02 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Bennet=20Ble=C3=9Fmann?= Date: Fri, 9 May 2025 23:45:03 +0200 Subject: [PATCH] fix crash when loading html --- src/lib/sgml.pl | 12 +++--- src/machine/system_calls.rs | 85 +++++++++++++++++++++++++++++++------ tests-pl/issue2949.pl | 11 +++++ tests/scryer/issues.rs | 6 +++ 4 files changed, 95 insertions(+), 19 deletions(-) create mode 100644 tests-pl/issue2949.pl diff --git a/src/lib/sgml.pl b/src/lib/sgml.pl index bccba1f3..b7933c9f 100644 --- a/src/lib/sgml.pl +++ b/src/lib/sgml.pl @@ -96,14 +96,14 @@ is_sgml_source([]). is_sgml_source([C|Cs]) :- must_be(chars, [C|Cs]). load_structure_([], [], _, _). -load_structure_([C|Cs], [E], Options, What) :- - load_(What, [C|Cs], E, Options). -load_structure_(file(Fs), [E], Options, What) :- +load_structure_([C|Cs], [E|Es], Options, What) :- + load_(What, [C|Cs], [E|Es], Options). +load_structure_(file(Fs), [E|Es], Options, What) :- once(phrase_from_file(seq(Cs), Fs)), - load_(What, Cs, E, Options). -load_structure_(stream(Stream), [E], Options, What) :- + load_(What, Cs, [E|Es], Options). +load_structure_(stream(Stream), [E|Es], Options, What) :- get_n_chars(Stream, _, Cs), - load_(What, Cs, E, Options). + load_(What, Cs, [E|Es], Options). load_(html, Cs, E, Options) :- '$load_html'(Cs, E, Options). load_(xml, Cs, E, Options) :- '$load_xml'(Cs, E, Options). 
diff --git a/src/machine/system_calls.rs b/src/machine/system_calls.rs index 3bee79d4..dc07cb78 100644 --- a/src/machine/system_calls.rs +++ b/src/machine/system_calls.rs @@ -8202,9 +8202,21 @@ impl Machine { .value_to_str_like(self.machine_st.registers[1]) { let document = scraper::Html::parse_document(&string.as_str()); - let result = self.html_node_to_term(document.tree.root().first_child().unwrap())?; - unify!(self.machine_st, self.machine_st.registers[2], result); + let root_nodes = document + .tree + .root() + .children() + .map(|child| self.html_node_to_term(child)) + .collect::<Result<Vec<_>, _>>()?; + + let nodes = sized_iter_to_heap_list( + &mut self.machine_st.heap, + root_nodes.len(), + root_nodes.into_iter(), + )?; + + unify!(self.machine_st, self.machine_st.registers[2], nodes); } else { self.machine_st.fail = true; } @@ -8651,12 +8663,39 @@ impl Machine { &mut self, node: ego_tree::NodeRef<'_, scraper::Node>, ) -> Result { - match node.value().as_element() { - None => self - .machine_st - .heap - .allocate_cstr(&node.value().as_text().unwrap().text), - Some(element) => { + match node.value() { + scraper::Node::Document | scraper::Node::Fragment => { + unreachable!("we never iterate the root itself only its children") + } + scraper::Node::Doctype(doctype) => { + // what about public and system id? 
+ let name = self.machine_st.heap.allocate_cstr(&doctype.name)?; + + let result = str_loc_as_cell!(self.machine_st.heap.cell_len()); + let mut writer = self.machine_st.heap.reserve(2)?; + + writer.write_with(|section| { + section.push_cell(atom_as_cell!(atom!("doctype"), 1)); + section.push_cell(name); + }); + + Ok(result) + } + scraper::Node::Comment(comment) => { + let comment = self.machine_st.heap.allocate_cstr(&comment)?; + + let result = str_loc_as_cell!(self.machine_st.heap.cell_len()); + let mut writer = self.machine_st.heap.reserve(2)?; + + writer.write_with(|section| { + section.push_cell(atom_as_cell!(atom!("comment"), 1)); + section.push_cell(comment); + }); + + Ok(result) + } + scraper::Node::Text(text) => self.machine_st.heap.allocate_cstr(&text.text), + scraper::Node::Element(element) => { let mut avec = Vec::new(); for attr in element.attrs() { @@ -8680,11 +8719,10 @@ impl Machine { avec.into_iter(), )?; - let mut cvec = Vec::new(); - - for child in node.children() { - cvec.push(self.html_node_to_term(child)?); - } + let cvec = node + .children() + .map(|child| self.html_node_to_term(child)) + .collect::<Result<Vec<_>, _>>()?; let children = sized_iter_to_heap_list( &mut self.machine_st.heap, @@ -8703,6 +8741,27 @@ impl Machine { section.push_cell(children); }); + Ok(result) + } + scraper::Node::ProcessingInstruction(processing_instruction) => { + let target = self + .machine_st + .heap + .allocate_cstr(&processing_instruction.target)?; + let data = self + .machine_st + .heap + .allocate_cstr(&processing_instruction.data)?; + + let result = str_loc_as_cell!(self.machine_st.heap.cell_len()); + let mut writer = self.machine_st.heap.reserve(3)?; + + writer.write_with(|section| { + section.push_cell(atom_as_cell!(atom!("processing_instruction"), 2)); + section.push_cell(target); + section.push_cell(data); + }); + Ok(result) } } diff --git a/tests-pl/issue2949.pl b/tests-pl/issue2949.pl new file mode 100644 index 00000000..901bf031 --- /dev/null +++ 
b/tests-pl/issue2949.pl @@ -0,0 +1,11 @@ +:- use_module(library(sgml)). + +test :- + load_html("Hello!", Es, []), + write(Es), + load_html("Hello!", Es2, []), + write(Es2), + load_html("