From 42d674950197882817e38d610de59f9d0cffa390 Mon Sep 17 00:00:00 2001
From: Mark <markjordanthom@gmail.com>
Date: Tue, 26 Dec 2023 12:30:46 -0700
Subject: [PATCH] throw errors from char_reader.rs and get_n_chars when reading
 bad UTF8 data (#2244)

---
 src/machine/system_calls.rs |  6 ++++
 src/parser/ast.rs           |  3 ++
 src/parser/char_reader.rs   | 60 +++++++++++++++++++------------------
 3 files changed, 40 insertions(+), 29 deletions(-)
diff --git a/src/machine/system_calls.rs b/src/machine/system_calls.rs
index c5beaf9a..631cf626 100644
--- a/src/machine/system_calls.rs
+++ b/src/machine/system_calls.rs
@@ -3492,6 +3492,12 @@ impl Machine {
                     Some(Ok(c)) => {
                         string.push(c);
                     }
+                    Some(Err(e)) => {
+                        let stub = functor_stub(atom!("$get_n_chars"), 3);
+                        let err = self.machine_st.session_error(SessionError::from(e));
+
+                        return Err(self.machine_st.error_form(err, stub));
+                    }
                     _ => {
                         break;
                     }
diff --git a/src/parser/ast.rs b/src/parser/ast.rs
index ec46a74f..04d6fc99 100644
--- a/src/parser/ast.rs
+++ b/src/parser/ast.rs
@@ -417,6 +417,9 @@ impl ParserError {
             ParserError::IO(e) if e.kind() == ErrorKind::UnexpectedEof => {
                 atom!("unexpected_end_of_file")
             }
+            ParserError::IO(e) if e.kind() == ErrorKind::InvalidData => {
+                atom!("invalid_data")
+            }
             ParserError::IO(_) => atom!("input_output_error"),
             ParserError::LexicalError(_) => atom!("lexical_error"),
             ParserError::MissingQuote(..) => atom!("missing_quote"),
diff --git a/src/parser/char_reader.rs b/src/parser/char_reader.rs
index bf3b8f8e..8553905d 100644
--- a/src/parser/char_reader.rs
+++ b/src/parser/char_reader.rs
@@ -144,6 +144,35 @@ impl<R: Read> CharRead for CharReader<R> {
             Err(e) => return Some(Err(e)),
         }
 
+        let bad_bytes_error = |buf: &[u8]| {
+            // Builds the InvalidData error reported when the bytes
+            // in `buf` cannot be decoded as UTF-8.
+
+            // Grow the "bad" prefix one byte at a time until the
+            // rest is empty or entirely valid UTF-8. `buf` must be
+            // non-empty: split_at(1) panics on an empty slice.
+
+            let mut split_point = 1;
+            let mut badbytes = vec![];
+
+            loop {
+                let (bad, rest) = buf.split_at(split_point);
+
+                if rest.is_empty() || str::from_utf8(rest).is_ok() {
+                    badbytes.extend_from_slice(bad);
+                    break;
+                }
+
+                split_point += 1;
+            }
+
+            // Only the error value is built here; `buf` itself is
+            // left untouched. NOTE(review): presumably the caller
+            // skips past the bad bytes on a later call -- confirm.
+
+            io::Error::new(io::ErrorKind::InvalidData, BadUtf8Error { bytes: badbytes })
+        };
+
         loop {
             let buf = &self.buf[self.pos..];
 
@@ -159,35 +188,7 @@ impl<R: Read> CharRead for CharReader<R> {
                 };
 
                 if buf.len() - e.valid_up_to() >= 4 {
-                    // If we have 4 bytes that still don't make up
-                    // a valid code point, then we have garbage.
-
-                    // We have bad data in the buffer. Remove
-                    // leading bytes until either the buffer is
-                    // empty, or we have a valid code point.
-
-                    let mut split_point = 1;
-                    let mut badbytes = vec![];
-
-                    loop {
-                        let (bad, rest) = buf.split_at(split_point);
-
-                        if rest.is_empty() || str::from_utf8(rest).is_ok() {
-                            badbytes.extend_from_slice(bad);
-                            break;
-                        }
-
-                        split_point += 1;
-                    }
-
-                    // Raise the error. If we still have data in
-                    // the buffer, it will be returned on the next
-                    // loop.
-
-                    return Some(Err(io::Error::new(
-                        io::ErrorKind::InvalidData,
-                        BadUtf8Error { bytes: badbytes },
-                    )));
+                    return Some(Err(bad_bytes_error(buf)));
                 } else if self.pos >= self.buf.len() {
                     return None;
                 } else if self.buf.len() - self.pos >= 4 {
@@ -223,6 +224,7 @@ impl<R: Read> CharRead for CharReader<R> {
 
                     match self.inner.read(word_slice) {
                         Err(e) => return Some(Err(e)),
+                        Ok(nread) if nread == 0 => return Some(Err(bad_bytes_error(&self.buf))),
                         Ok(nread) => {
                             self.buf.extend_from_slice(&word_slice[0..nread]);
                         }
-- 
2.54.0