From 42d674950197882817e38d610de59f9d0cffa390 Mon Sep 17 00:00:00 2001 From: Mark Date: Tue, 26 Dec 2023 12:30:46 -0700 Subject: [PATCH] throw errors from char_reader.rs and get_n_chars when reading bad UTF8 data (#2244) --- src/machine/system_calls.rs | 6 ++++ src/parser/ast.rs | 3 ++ src/parser/char_reader.rs | 60 +++++++++++++++++++------------------ 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/src/machine/system_calls.rs b/src/machine/system_calls.rs index c5beaf9a..631cf626 100644 --- a/src/machine/system_calls.rs +++ b/src/machine/system_calls.rs @@ -3492,6 +3492,12 @@ impl Machine { Some(Ok(c)) => { string.push(c); } + Some(Err(e)) => { + let stub = functor_stub(atom!("$get_n_chars"), 3); + let err = self.machine_st.session_error(SessionError::from(e)); + + return Err(self.machine_st.error_form(err, stub)); + } _ => { break; } diff --git a/src/parser/ast.rs b/src/parser/ast.rs index ec46a74f..04d6fc99 100644 --- a/src/parser/ast.rs +++ b/src/parser/ast.rs @@ -417,6 +417,9 @@ impl ParserError { ParserError::IO(e) if e.kind() == ErrorKind::UnexpectedEof => { atom!("unexpected_end_of_file") } + ParserError::IO(e) if e.kind() == ErrorKind::InvalidData => { + atom!("invalid_data") + } ParserError::IO(_) => atom!("input_output_error"), ParserError::LexicalError(_) => atom!("lexical_error"), ParserError::MissingQuote(..) => atom!("missing_quote"), diff --git a/src/parser/char_reader.rs b/src/parser/char_reader.rs index bf3b8f8e..8553905d 100644 --- a/src/parser/char_reader.rs +++ b/src/parser/char_reader.rs @@ -144,6 +144,35 @@ impl CharRead for CharReader { Err(e) => return Some(Err(e)), } + let bad_bytes_error = |buf: &[u8]| { + // If we have 4 bytes that still don't make up + // a valid code point, then we have garbage. + + // We have bad data in the buffer. Remove + // leading bytes until either the buffer is + // empty, or we have a valid code point. + + let mut split_point = 1; + let mut badbytes = vec![]; + + loop { + let (bad, rest) = buf.split_at(split_point); + + if rest.is_empty() || str::from_utf8(rest).is_ok() { + badbytes.extend_from_slice(bad); + break; + } + + split_point += 1; + } + + // Raise the error. If we still have data in + // the buffer, it will be returned on the next + // loop. + + io::Error::new(io::ErrorKind::InvalidData, BadUtf8Error { bytes: badbytes }) + }; + loop { let buf = &self.buf[self.pos..]; @@ -159,35 +188,7 @@ impl CharRead for CharReader { }; if buf.len() - e.valid_up_to() >= 4 { - // If we have 4 bytes that still don't make up - // a valid code point, then we have garbage. - - // We have bad data in the buffer. Remove - // leading bytes until either the buffer is - // empty, or we have a valid code point. - - let mut split_point = 1; - let mut badbytes = vec![]; - - loop { - let (bad, rest) = buf.split_at(split_point); - - if rest.is_empty() || str::from_utf8(rest).is_ok() { - badbytes.extend_from_slice(bad); - break; - } - - split_point += 1; - } - - // Raise the error. If we still have data in - // the buffer, it will be returned on the next - // loop. - - return Some(Err(io::Error::new( - io::ErrorKind::InvalidData, - BadUtf8Error { bytes: badbytes }, - ))); + return Some(Err(bad_bytes_error(buf))); } else if self.pos >= self.buf.len() { return None; } else if self.buf.len() - self.pos >= 4 { @@ -223,6 +224,7 @@ impl CharRead for CharReader { match self.inner.read(word_slice) { Err(e) => return Some(Err(e)), + Ok(nread) if nread == 0 => return Some(Err(bad_bytes_error(&self.buf))), Ok(nread) => { self.buf.extend_from_slice(&word_slice[0..nread]); } -- 2.54.0