Repositorios git - scryer-prolog.git/commitdiff
fix crash when loading html
author Bennet Bleßmann <[email protected]>
Fri, 9 May 2025 21:45:03 +0000 (23:45 +0200)
committer Mark Thom <[email protected]>
Tue, 8 Jul 2025 05:39:47 +0000 (22:39 -0700)
src/lib/sgml.pl
src/machine/system_calls.rs
tests-pl/issue2949.pl [new file with mode: 0644]
tests/scryer/issues.rs

diff --git a/src/lib/sgml.pl b/src/lib/sgml.pl
index bccba1f390493188353d0e49280fe447c8da6a98..b7933c9f9d93a31349a4caace127cb7c86ee5025 100644 (file)
@@ -96,14 +96,14 @@ is_sgml_source([]).
 is_sgml_source([C|Cs]) :- must_be(chars, [C|Cs]).
 
 load_structure_([], [], _, _).
-load_structure_([C|Cs], [E], Options, What) :-
-        load_(What, [C|Cs], E, Options).
-load_structure_(file(Fs), [E], Options, What) :-
+load_structure_([C|Cs], [E|Es], Options, What) :-
+        load_(What, [C|Cs], [E|Es], Options).
+load_structure_(file(Fs), [E|Es], Options, What) :-
         once(phrase_from_file(seq(Cs), Fs)),
-        load_(What, Cs, E, Options).
-load_structure_(stream(Stream), [E], Options, What) :-
+        load_(What, Cs, [E|Es], Options).
+load_structure_(stream(Stream), [E|Es], Options, What) :-
         get_n_chars(Stream, _, Cs),
-        load_(What, Cs, E, Options).
+        load_(What, Cs, [E|Es], Options).
 
 load_(html, Cs, E, Options) :- '$load_html'(Cs, E, Options).
 load_(xml, Cs, E, Options) :- '$load_xml'(Cs, E, Options).
diff --git a/src/machine/system_calls.rs b/src/machine/system_calls.rs
index 3bee79d4c5623a08578482694da7b15be3781cb6..dc07cb7867841e1121ae71cecdc1683477fc046f 100644 (file)
@@ -8202,9 +8202,21 @@ impl Machine {
             .value_to_str_like(self.machine_st.registers[1])
         {
             let document = scraper::Html::parse_document(&string.as_str());
-            let result = self.html_node_to_term(document.tree.root().first_child().unwrap())?;
 
-            unify!(self.machine_st, self.machine_st.registers[2], result);
+            let root_nodes = document
+                .tree
+                .root()
+                .children()
+                .map(|child| self.html_node_to_term(child))
+                .collect::<Result<Vec<_>, _>>()?;
+
+            let nodes = sized_iter_to_heap_list(
+                &mut self.machine_st.heap,
+                root_nodes.len(),
+                root_nodes.into_iter(),
+            )?;
+
+            unify!(self.machine_st, self.machine_st.registers[2], nodes);
         } else {
             self.machine_st.fail = true;
         }
@@ -8651,12 +8663,39 @@ impl Machine {
         &mut self,
         node: ego_tree::NodeRef<'_, scraper::Node>,
     ) -> Result<HeapCellValue, usize> {
-        match node.value().as_element() {
-            None => self
-                .machine_st
-                .heap
-                .allocate_cstr(&node.value().as_text().unwrap().text),
-            Some(element) => {
+        match node.value() {
+            scraper::Node::Document | scraper::Node::Fragment => {
+                unreachable!("we never iterate the root itself only its children")
+            }
+            scraper::Node::Doctype(doctype) => {
+                // what about public and system id?
+                let name = self.machine_st.heap.allocate_cstr(&doctype.name)?;
+
+                let result = str_loc_as_cell!(self.machine_st.heap.cell_len());
+                let mut writer = self.machine_st.heap.reserve(2)?;
+
+                writer.write_with(|section| {
+                    section.push_cell(atom_as_cell!(atom!("doctype"), 1));
+                    section.push_cell(name);
+                });
+
+                Ok(result)
+            }
+            scraper::Node::Comment(comment) => {
+                let comment = self.machine_st.heap.allocate_cstr(&comment)?;
+
+                let result = str_loc_as_cell!(self.machine_st.heap.cell_len());
+                let mut writer = self.machine_st.heap.reserve(2)?;
+
+                writer.write_with(|section| {
+                    section.push_cell(atom_as_cell!(atom!("comment"), 1));
+                    section.push_cell(comment);
+                });
+
+                Ok(result)
+            }
+            scraper::Node::Text(text) => self.machine_st.heap.allocate_cstr(&text.text),
+            scraper::Node::Element(element) => {
                 let mut avec = Vec::new();
 
                 for attr in element.attrs() {
@@ -8680,11 +8719,10 @@ impl Machine {
                     avec.into_iter(),
                 )?;
 
-                let mut cvec = Vec::new();
-
-                for child in node.children() {
-                    cvec.push(self.html_node_to_term(child)?);
-                }
+                let cvec = node
+                    .children()
+                    .map(|child| self.html_node_to_term(child))
+                    .collect::<Result<Vec<_>, _>>()?;
 
                 let children = sized_iter_to_heap_list(
                     &mut self.machine_st.heap,
@@ -8703,6 +8741,27 @@ impl Machine {
                     section.push_cell(children);
                 });
 
+                Ok(result)
+            }
+            scraper::Node::ProcessingInstruction(processing_instruction) => {
+                let target = self
+                    .machine_st
+                    .heap
+                    .allocate_cstr(&processing_instruction.target)?;
+                let data = self
+                    .machine_st
+                    .heap
+                    .allocate_cstr(&processing_instruction.data)?;
+
+                let result = str_loc_as_cell!(self.machine_st.heap.cell_len());
+                let mut writer = self.machine_st.heap.reserve(3)?;
+
+                writer.write_with(|section| {
+                    section.push_cell(atom_as_cell!(atom!("processing_instruction"), 2));
+                    section.push_cell(target);
+                    section.push_cell(data);
+                });
+
                 Ok(result)
             }
         }
diff --git a/tests-pl/issue2949.pl b/tests-pl/issue2949.pl
new file mode 100644 (file)
index 0000000..901bf03
--- /dev/null
@@ -0,0 +1,11 @@
+:- use_module(library(sgml)).
+
+test :- 
+    load_html("<!DOCTYPE html><html><head><title>Hello!</title></head></html>", Es, []), 
+    write(Es),
+    load_html("<!DOCTYPE html><html><head><title>Hello!</title><!-- comment --></head></html>", Es2, []), 
+    write(Es2),
+    load_html("<!", Es3, []), 
+    write(Es3).
+
+:- initialization(test).
diff --git a/tests/scryer/issues.rs b/tests/scryer/issues.rs
index 974183a4feb52fcd35af0419911630156049493f..de5db9155f7ff1695a0475e98334af1298669d5e 100644 (file)
@@ -20,6 +20,12 @@ fn issue2588_load_html() {
     load_module_test("tests-pl/issue2588.pl", "[element(html,[],[element(head,[],[element(title,[],[[H,e,l,l,o,!]])]),element(body,[],[])])]");
 }
 
+#[test]
+#[cfg_attr(miri, ignore = "unsupported operation when isolation is enabled")]
+fn issue2949_load_html() {
+    load_module_test("tests-pl/issue2949.pl", "[doctype([h,t,m,l]),element(html,[],[element(head,[],[element(title,[],[[H,e,l,l,o,!]])]),element(body,[],[])])][doctype([h,t,m,l]),element(html,[],[element(head,[],[element(title,[],[[H,e,l,l,o,!]]),comment([ ,c,o,m,m,e,n,t, ])]),element(body,[],[])])][comment([]),element(html,[],[element(head,[],[]),element(body,[],[])])]");
+}
+
 // issue #2361
 #[serial]
 #[test]