From 4ffdd9eb153ac73a41320c7b3b68978ca6c89920 Mon Sep 17 00:00:00 2001 From: Caden Haustein Date: Tue, 2 Feb 2021 18:35:41 -0600 Subject: [PATCH] Add prolog_parser --- Cargo.lock | 2 - Cargo.toml | 4 +- crates/prolog_parser/Cargo.toml | 21 + crates/prolog_parser/src/ast.rs | 815 ++++++++++++++++++++ crates/prolog_parser/src/lexer.rs | 898 ++++++++++++++++++++++ crates/prolog_parser/src/lib.rs | 15 + crates/prolog_parser/src/macros.rs | 187 +++++ crates/prolog_parser/src/parser.rs | 998 +++++++++++++++++++++++++ crates/prolog_parser/src/put_back_n.rs | 71 ++ crates/prolog_parser/src/tabled_rc.rs | 153 ++++ 10 files changed, 3160 insertions(+), 4 deletions(-) create mode 100644 crates/prolog_parser/Cargo.toml create mode 100644 crates/prolog_parser/src/ast.rs create mode 100644 crates/prolog_parser/src/lexer.rs create mode 100644 crates/prolog_parser/src/lib.rs create mode 100644 crates/prolog_parser/src/macros.rs create mode 100644 crates/prolog_parser/src/parser.rs create mode 100644 crates/prolog_parser/src/put_back_n.rs create mode 100644 crates/prolog_parser/src/tabled_rc.rs diff --git a/Cargo.lock b/Cargo.lock index c54283f8..6c55c9ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -880,8 +880,6 @@ dependencies = [ [[package]] name = "prolog_parser_rebis" version = "0.8.68" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52213fbb212208bf9e80c6d34bbfeb9ebef4dca28462f034e195e073e79c334" dependencies = [ "lexical", "num-rug-adapter", diff --git a/Cargo.toml b/Cargo.toml index 93caf8ff..084990ef 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,7 @@ libc = "0.2.62" nix = "0.15.0" num-rug-adapter = { optional = true, version = "0.1.4" } ordered-float = "0.5.0" -prolog_parser_rebis = { version = "0.8.68", default-features = false } +prolog_parser_rebis = { path = "./crates/prolog_parser", default-features = false } ref_thread_local = "0.0.0" rug = { version = "1.4.0", optional = true } rustyline = "7.0.0" @@ -48,4 +48,4 @@ select = 
"0.4.3" roxmltree = "0.11.0" base64 = "0.12.3" sodiumoxide = "0.2.6" -slice-deque = "0.3.0" \ No newline at end of file +slice-deque = "0.3.0" diff --git a/crates/prolog_parser/Cargo.toml b/crates/prolog_parser/Cargo.toml new file mode 100644 index 00000000..dfdc5a32 --- /dev/null +++ b/crates/prolog_parser/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "prolog_parser_rebis" +version = "0.8.68" +authors = ["Mark Thom "] +repository = "https://github.com/mthom/prolog_parser" +description = " An operator precedence parser for the Rebis development version of Scryer Prolog, an up and coming ISO Prolog implementation." +license = "BSD-3-Clause" + +[dependencies] +lexical = "2.1.0" +ordered-float = "0.5.0" +rug = { optional = true, version = "1.4.0" } +num-rug-adapter = { optional = true, version = "0.1.3" } +unicode_reader = "1.0.0" + +[lib] +path = "src/lib.rs" + +[features] +num = ["num-rug-adapter"] +default = ["rug"] diff --git a/crates/prolog_parser/src/ast.rs b/crates/prolog_parser/src/ast.rs new file mode 100644 index 00000000..d337e9c7 --- /dev/null +++ b/crates/prolog_parser/src/ast.rs @@ -0,0 +1,815 @@ +use rug::{Integer, Rational}; +use ordered_float::*; +use tabled_rc::*; + +use put_back_n::*; + +use std::cell::Cell; +use std::cmp::Ordering; +use std::collections::HashMap; +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::io::{Bytes, Error as IOError, Read}; +use std::ops::Deref; +use std::rc::Rc; +use std::vec::Vec; + +use unicode_reader::CodePoints; + +pub type Atom = String; + +pub type Var = String; + +pub type Specifier = u32; + +pub const MAX_ARITY: usize = 1023; + +pub const XFX: u32 = 0x0001; +pub const XFY: u32 = 0x0002; +pub const YFX: u32 = 0x0004; +pub const XF: u32 = 0x0010; +pub const YF: u32 = 0x0020; +pub const FX: u32 = 0x0040; +pub const FY: u32 = 0x0080; +pub const DELIMITER: u32 = 0x0100; +pub const TERM: u32 = 0x1000; +pub const LTERM: u32 = 0x3000; + +pub const NEGATIVE_SIGN: u32 = 0x0200; + +#[macro_export] +macro_rules! 
clause_name { + ($name: expr, $tbl: expr) => ( + ClauseName::User(TabledRc::new($name, $tbl.clone())) + ) ; + ($name: expr) => ( + ClauseName::BuiltIn($name) + ) +} + +#[macro_export] +macro_rules! atom { + ($e:expr, $tbl:expr) => ( + Constant::Atom(ClauseName::User(tabled_rc!($e, $tbl)), None) + ); + ($e:expr) => ( + Constant::Atom(clause_name!($e), None) + ) +} + +#[macro_export] +macro_rules! rc_atom { + ($e:expr) => ( + Rc::new(String::from($e)) + ) +} +macro_rules! is_term { + ($x:expr) => ( ($x & TERM) != 0 ) +} + +macro_rules! is_lterm { + ($x:expr) => ( ($x & LTERM) != 0 ) +} + +macro_rules! is_op { + ($x:expr) => ( $x & (XF | YF | FX | FY | XFX | XFY | YFX) != 0 ) +} + +macro_rules! is_negate { + ($x:expr) => ( ($x & NEGATIVE_SIGN) != 0 ) +} + +#[macro_export] +macro_rules! is_prefix { + ($x:expr) => ( $x & (FX | FY) != 0 ) +} + +#[macro_export] +macro_rules! is_postfix { + ($x:expr) => ( $x & (XF | YF) != 0 ) +} + +#[macro_export] +macro_rules! is_infix { + ($x:expr) => ( ($x & (XFX | XFY | YFX)) != 0 ) +} + +#[macro_export] +macro_rules! is_xfx { + ($x:expr) => ( ($x & XFX) != 0 ) +} + +#[macro_export] +macro_rules! is_xfy { + ($x:expr) => ( ($x & XFY) != 0 ) +} + +#[macro_export] +macro_rules! is_yfx { + ($x:expr) => ( ($x & YFX) != 0 ) +} + +#[macro_export] +macro_rules! is_yf { + ($x:expr) => ( ($x & YF) != 0 ) +} + +#[macro_export] +macro_rules! is_xf { + ($x:expr) => ( ($x & XF) != 0 ) +} + +#[macro_export] +macro_rules! is_fx { + ($x:expr) => ( ($x & FX) != 0 ) +} + +#[macro_export] +macro_rules! 
is_fy { + ($x:expr) => ( ($x & FY) != 0 ) +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum RegType { + Perm(usize), + Temp(usize) +} + +impl Default for RegType { + fn default() -> Self { + RegType::Temp(0) + } +} + +impl RegType { + pub fn reg_num(self) -> usize { + match self { + RegType::Perm(reg_num) | RegType::Temp(reg_num) => reg_num + } + } + + pub fn is_perm(self) -> bool { + match self { + RegType::Perm(_) => true, + _ => false + } + } +} + +impl fmt::Display for RegType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + &RegType::Perm(val) => write!(f, "Y{}", val), + &RegType::Temp(val) => write!(f, "X{}", val) + } + } +} + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum VarReg { + ArgAndNorm(RegType, usize), + Norm(RegType) +} + +impl VarReg { + pub fn norm(self) -> RegType { + match self { + VarReg::ArgAndNorm(reg, _) | VarReg::Norm(reg) => reg + } + } +} + +impl fmt::Display for VarReg { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + &VarReg::Norm(RegType::Perm(reg)) => write!(f, "Y{}", reg), + &VarReg::Norm(RegType::Temp(reg)) => write!(f, "X{}", reg), + &VarReg::ArgAndNorm(RegType::Perm(reg), arg) => + write!(f, "Y{} A{}", reg, arg), + &VarReg::ArgAndNorm(RegType::Temp(reg), arg) => + write!(f, "X{} A{}", reg, arg) + } + } +} + +impl Default for VarReg { + fn default() -> Self { + VarReg::Norm(RegType::default()) + } +} + +#[macro_export] +macro_rules! temp_v { + ($x:expr) => ( + RegType::Temp($x) + ) +} + +#[macro_export] +macro_rules! 
perm_v { + ($x:expr) => ( + RegType::Perm($x) + ) +} + +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub enum GenContext { + Head, Mid(usize), Last(usize) // Mid & Last: chunk_num +} + +impl GenContext { + pub fn chunk_num(self) -> usize { + match self { + GenContext::Head => 0, + GenContext::Mid(cn) | GenContext::Last(cn) => cn + } + } +} + +pub type OpDirKey = (ClauseName, Fixity); + +#[derive(Debug, Clone)] +pub struct OpDirValue(pub SharedOpDesc); + +impl OpDirValue { + pub fn new(spec: Specifier, priority: usize) -> Self { + OpDirValue(SharedOpDesc::new(priority, spec)) + } + + #[inline] + pub fn shared_op_desc(&self) -> SharedOpDesc { + self.0.clone() + } +} + +// name and fixity -> operator type and precedence. +pub type OpDir = HashMap; + +#[derive(Debug, Clone, Copy)] +pub struct MachineFlags { + pub double_quotes: DoubleQuotes +} + +impl Default for MachineFlags { + fn default() -> Self { + MachineFlags { double_quotes: DoubleQuotes::default() } + } +} + +#[derive(Debug, Clone, Copy)] +pub enum DoubleQuotes { + Atom, Chars, Codes +} + +impl DoubleQuotes { + pub fn is_chars(self) -> bool { + if let DoubleQuotes::Chars = self { + true + } else { + false + } + } + + pub fn is_atom(self) -> bool { + if let DoubleQuotes::Atom = self { + true + } else { + false + } + } + + pub fn is_codes(self) -> bool { + if let DoubleQuotes::Codes = self { + true + } else { + false + } + } +} + +impl Default for DoubleQuotes { + fn default() -> Self { + DoubleQuotes::Chars + } +} + +pub fn default_op_dir() -> OpDir { + let mut op_dir = OpDir::new(); + + op_dir.insert((clause_name!(":-"), Fixity::In), OpDirValue::new(XFX, 1200)); + op_dir.insert((clause_name!(":-"), Fixity::Pre), OpDirValue::new(FX, 1200)); + op_dir.insert((clause_name!("?-"), Fixity::Pre), OpDirValue::new(FX, 1200)); + op_dir.insert((clause_name!(","), Fixity::In), OpDirValue::new(XFY, 1000)); + + op_dir +} + +#[derive(Debug, Clone)] +pub enum ArithmeticError { + 
NonEvaluableFunctor(Constant, usize), + UninstantiatedVar +} + +#[derive(Debug)] +pub enum ParserError { + BackQuotedString(usize, usize), + UnexpectedChar(char, usize, usize), + UnexpectedEOF, + IO(IOError), + IncompleteReduction(usize, usize), + InvalidSingleQuotedCharacter(char), + MissingQuote(usize, usize), + NonPrologChar(usize, usize), + ParseBigInt(usize, usize), + Utf8Error(usize, usize) +} + +impl ParserError { + pub fn line_and_col_num(&self) -> Option<(usize, usize)> { + match self { + &ParserError::BackQuotedString(line_num, col_num) + | &ParserError::UnexpectedChar(_, line_num, col_num) + | &ParserError::IncompleteReduction(line_num, col_num) + | &ParserError::MissingQuote(line_num, col_num) + | &ParserError::NonPrologChar(line_num, col_num) + | &ParserError::ParseBigInt(line_num, col_num) + | &ParserError::Utf8Error(line_num, col_num) => + Some((line_num, col_num)), + _ => + None + } + } + + pub fn as_str(&self) -> &'static str { + match self { + &ParserError::BackQuotedString(..) => + "back_quoted_string", + &ParserError::UnexpectedChar(..) => + "unexpected_char", + &ParserError::UnexpectedEOF => + "unexpected_end_of_file", + &ParserError::IncompleteReduction(..) => + "incomplete_reduction", + &ParserError::InvalidSingleQuotedCharacter(..) => + "invalid_single_quoted_character", + &ParserError::IO(_) => + "input_output_error", + &ParserError::MissingQuote(..) => + "missing_quote", + &ParserError::NonPrologChar(..) => + "non_prolog_character", + &ParserError::ParseBigInt(..) => + "cannot_parse_big_int", + &ParserError::Utf8Error(..) 
=> + "utf8_conversion_error", + } + } +} + +impl From for ParserError { + fn from(err: IOError) -> ParserError { + ParserError::IO(err) + } +} + +impl From<&IOError> for ParserError { + fn from(error: &IOError) -> ParserError { + if error.get_ref().filter(|e| e.is::()).is_some() { + ParserError::Utf8Error(0, 0) + } else { + ParserError::IO(error.kind().into()) + } + } +} + + +#[derive(Debug, Clone, Copy)] +pub struct CompositeOpDir<'a, 'b> { + pub primary_op_dir: Option<&'b OpDir>, + pub secondary_op_dir: &'a OpDir, +} + +impl<'a, 'b> CompositeOpDir<'a, 'b> +{ + #[inline] + pub fn new(secondary_op_dir: &'a OpDir, primary_op_dir: Option<&'b OpDir>) -> Self { + CompositeOpDir { primary_op_dir, secondary_op_dir } + } + + #[inline] + pub(crate) + fn get(&self, name: ClauseName, fixity: Fixity) -> Option<&OpDirValue> + { + let entry = + if let Some(ref primary_op_dir) = &self.primary_op_dir { + primary_op_dir.get(&(name.clone(), fixity)) + } else { + None + }; + + entry.or_else(move || self.secondary_op_dir.get(&(name, fixity))) + } +} + + +#[derive(Debug, Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)] +pub enum Fixity { + In, Post, Pre +} + +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +pub struct SharedOpDesc(Rc>); + +impl SharedOpDesc { + #[inline] + pub fn new(priority: usize, spec: Specifier) -> Self { + SharedOpDesc(Rc::new(Cell::new((priority, spec)))) + } + + #[inline] + pub fn ptr_eq(lop_desc: &SharedOpDesc, rop_desc: &SharedOpDesc) -> bool { + Rc::ptr_eq(&lop_desc.0, &rop_desc.0) + } + + #[inline] + pub fn arity(&self) -> usize { + if self.get().1 & (XFX | XFY | YFX) == 0 { + 1 + } else { + 2 + } + } + + #[inline] + pub fn get(&self) -> (usize, Specifier) { + self.0.get() + } + + #[inline] + pub fn set(&self, prec: usize, spec: Specifier) { + self.0.set((prec, spec)); + } + + #[inline] + pub fn prec(&self) -> usize { + self.0.get().0 + } + + #[inline] + pub fn assoc(&self) -> Specifier { + self.0.get().1 + } +} + +impl Deref for SharedOpDesc { 
+ type Target = Cell<(usize, Specifier)>; + + #[inline] + fn deref(&self) -> &Self::Target { + self.0.deref() + } +} + +// this ensures that SharedOpDesc (which is not consistently placed in +// every atom!) doesn't affect the value of an atom hash. If +// SharedOpDesc values are to be indexed, a BTreeMap or BTreeSet +// should be used, obviously. +impl Hash for SharedOpDesc { + fn hash(&self, state: &mut H) { + 0.hash(state) + } +} + +#[derive(Debug, Clone, Hash)] +pub enum Constant { + Atom(ClauseName, Option), + Char(char), + EmptyList, + Fixnum(isize), + Integer(Rc), + Rational(Rc), + Float(OrderedFloat), + String(Rc), + Usize(usize), +} + +impl fmt::Display for Constant { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + &Constant::Atom(ref atom, _) => + if atom.as_str().chars().any(|c| "`.$'\" ".contains(c)) { + write!(f, "'{}'", atom.as_str()) + } else { + write!(f, "{}", atom.as_str()) + }, + &Constant::Char(c) => + write!(f, "'{}'", c as u32), + &Constant::EmptyList => + write!(f, "[]"), + &Constant::Fixnum(n) => + write!(f, "{}", n), + &Constant::Integer(ref n) => + write!(f, "{}", n), + &Constant::Rational(ref n) => + write!(f, "{}", n), + &Constant::Float(ref n) => + write!(f, "{}", n), + &Constant::String(ref s) => + write!(f, "\"{}\"", &s), + &Constant::Usize(integer) => + write!(f, "u{}", integer), + } + } +} + +impl PartialEq for Constant { + fn eq(&self, other: &Constant) -> bool { + match (self, other) { + (&Constant::Atom(ref atom, _), &Constant::Char(c)) + | (&Constant::Char(c), &Constant::Atom(ref atom, _)) => { + atom.is_char() && Some(c) == atom.as_str().chars().next() + }, + (&Constant::Atom(ref a1, _), &Constant::Atom(ref a2, _)) => + a1.as_str() == a2.as_str(), + (&Constant::Char(c1), &Constant::Char(c2)) => + c1 == c2, + (&Constant::Fixnum(n1), &Constant::Fixnum(n2)) => + n1 == n2, + (&Constant::Fixnum(n1), &Constant::Integer(ref n2)) | + (&Constant::Integer(ref n2), &Constant::Fixnum(n1)) => { + if let Some(n2) = 
n2.to_isize() { + n1 == n2 + } else { + false + } + } + (&Constant::Integer(ref n1), &Constant::Integer(ref n2)) => + n1 == n2, + (&Constant::Rational(ref n1), &Constant::Rational(ref n2)) => + n1 == n2, + (&Constant::Float(ref n1), &Constant::Float(ref n2)) => + n1 == n2, + (&Constant::String(ref s1), &Constant::String(ref s2)) => { + &s1 == &s2 + } + (&Constant::EmptyList, &Constant::EmptyList) => + true, + (&Constant::Usize(u1), &Constant::Usize(u2)) => + u1 == u2, + _ => false + } + } +} + +impl Eq for Constant {} + +impl Constant { + pub fn to_atom(self) -> Option { + match self { + Constant::Atom(a, _) => Some(a.defrock_brackets()), + _ => None + } + } +} + +#[derive(Debug, Clone)] +pub enum ClauseName { + BuiltIn(&'static str), + User(TabledRc) +} + +impl fmt::Display for ClauseName { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl Hash for ClauseName { + fn hash(&self, state: &mut H) { + (*self.as_str()).hash(state) + } +} + +impl PartialEq for ClauseName { + fn eq(&self, other: &ClauseName) -> bool { + *self.as_str() == *other.as_str() + } +} + +impl Eq for ClauseName {} + +impl Ord for ClauseName { + fn cmp(&self, other: &ClauseName) -> Ordering { + (*self.as_str()).cmp(other.as_str()) + } +} + +impl PartialOrd for ClauseName { + fn partial_cmp(&self, other: &ClauseName) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> From<&'a TabledRc> for ClauseName { + fn from(name: &'a TabledRc) -> ClauseName { + ClauseName::User(name.clone()) + } +} + +impl ClauseName { + #[inline] + pub fn owning_module(&self) -> Self { + match self { + &ClauseName::User(ref name) => { + let module = name.owning_module(); + ClauseName::User(TabledRc { atom: module.clone(), + table: TabledData::new(module) }) + }, + _ => clause_name!("user") + } + } + + #[inline] + pub fn to_rc(&self) -> Rc { + match self { + &ClauseName::BuiltIn(s) => Rc::new(s.to_string()), + &ClauseName::User(ref rc) => rc.inner() + } + } + + 
#[inline] + pub fn with_table(self, atom_tbl: TabledData) -> Self { + match self { + ClauseName::BuiltIn(_) => self, + ClauseName::User(mut name) => { + name.table = atom_tbl; + ClauseName::User(name) + } + } + } + + #[inline] + pub fn has_table(&self, atom_tbl: &TabledData) -> bool { + match self { + ClauseName::BuiltIn(_) => false, + ClauseName::User(ref name) => &name.table == atom_tbl, + } + } + + #[inline] + pub fn has_table_of(&self, other: &ClauseName) -> bool { + match self { + ClauseName::BuiltIn(_) => { + if let ClauseName::BuiltIn(_) = other { + true + } else { + false + } + } + ClauseName::User(ref name) => { + other.has_table(&name.table) + } + } + } + + #[inline] + pub fn as_str(&self) -> &str { + match self { + &ClauseName::BuiltIn(s) => s, + &ClauseName::User(ref name) => name.as_ref() + } + } + + #[inline] + pub fn is_char(&self) -> bool { + !self.as_str().is_empty() && self.as_str().chars().skip(1).next().is_none() + } + + pub fn defrock_brackets(self) -> Self { + fn defrock_brackets(s: &str) -> &str { + if s.starts_with('(') && s.ends_with(')') { + &s[1 .. 
s.len() - 1] + } else { + s + } + } + + match self { + ClauseName::BuiltIn(s) => + ClauseName::BuiltIn(defrock_brackets(s)), + ClauseName::User(s) => + ClauseName::User(tabled_rc!(defrock_brackets(s.as_str()).to_owned(), s.table)) + } + } +} + +impl AsRef for ClauseName { + #[inline] + fn as_ref(self: &Self) -> &str { + self.as_str() + } +} + +#[derive(Debug, PartialEq, Eq, Clone)] +pub enum Term { + AnonVar, + Clause(Cell, ClauseName, Vec>, Option), + Cons(Cell, Box, Box), + Constant(Cell, Constant), + Var(Cell, Rc) +} + +impl Term { + pub fn shared_op_desc(&self) -> Option { + match self { + &Term::Clause(_, _, _, ref spec) => spec.clone(), + &Term::Constant(_, Constant::Atom(_, ref spec)) => spec.clone(), + _ => None + } + } + + pub fn to_constant(self) -> Option { + match self { + Term::Constant(_, c) => Some(c), + _ => None + } + } + + pub fn first_arg(&self) -> Option<&Term> { + match self { + &Term::Clause(_, _, ref terms, _) => + terms.first().map(|bt| bt.as_ref()), + _ => None + } + } + + pub fn set_name(&mut self, new_name: ClauseName) { + match self { + Term::Constant(_, Constant::Atom(ref mut atom, _)) + | Term::Clause(_, ref mut atom, ..) => { + *atom = new_name; + } + _ => {} + } + } + + pub fn name(&self) -> Option { + match self { + &Term::Constant(_, Constant::Atom(ref atom, _)) + | &Term::Clause(_, ref atom, ..) => Some(atom.clone()), + _ => None + } + } + + pub fn arity(&self) -> usize { + match self { + &Term::Clause(_, _, ref child_terms, ..) 
=> child_terms.len(), + _ => 0 + } + } +} + +fn unfold_by_str_once(term: &mut Term, s: &str) -> Option<(Term, Term)> { + if let &mut Term::Clause(_, ref name, ref mut subterms, _) = term { + if name.as_str() == s && subterms.len() == 2 { + let snd = *subterms.pop().unwrap(); + let fst = *subterms.pop().unwrap(); + + return Some((fst, snd)); + } + } + + None +} + +pub fn unfold_by_str(mut term: Term, s: &str) -> Vec { + let mut terms = vec![]; + + while let Some((fst, snd)) = unfold_by_str_once(&mut term, s) { + terms.push(fst); + term = snd; + } + + terms.push(term); + terms +} + +pub type ParsingStream = PutBackN>>; + +use unicode_reader::BadUtf8Error; + +#[inline] +pub fn parsing_stream(src: R) -> Result, ParserError> { + let mut stream = put_back_n(CodePoints::from(src.bytes())); + match stream.peek() { + None => Ok(stream), // empty stream is handled gracefully by Lexer::eof + Some(Err(error)) => Err(ParserError::from(error)), + Some(Ok(c)) => { + if *c == '\u{feff}' { + // skip UTF-8 BOM + stream.next(); + } + Ok(stream) + } + } +} diff --git a/crates/prolog_parser/src/lexer.rs b/crates/prolog_parser/src/lexer.rs new file mode 100644 index 00000000..b1656a55 --- /dev/null +++ b/crates/prolog_parser/src/lexer.rs @@ -0,0 +1,898 @@ +use crate::lexical::parse_lossy; +use crate::ordered_float::*; +use crate::rug::Integer; + +use ast::*; +use tabled_rc::*; + +use std::convert::TryFrom; +use std::fmt; +use std::io::Read; +use std::rc::Rc; + +macro_rules! is_not_eof { + ($c:expr) => ( + match $c { + Ok(c) => c, + Err(ParserError::UnexpectedEOF) => return Ok(true), + Err(e) => return Err(e) + } + ) +} + +macro_rules! 
consume_chars_with { + ($token:expr, $e:expr) => { + loop { + match $e { + Ok(Some(c)) => $token.push(c), + Ok(None) => continue, + Err(ParserError::UnexpectedChar(..)) => break, + Err(e) => return Err(e) + } + } + } +} + +#[derive(Debug, Clone, PartialEq)] +pub enum Token { + Constant(Constant), + Var(Rc), + Open, // '(' + OpenCT, // '(' + Close, // ')' + OpenList, // '[' + CloseList, // ']' + OpenCurly, // '{' + CloseCurly, // '}' + HeadTailSeparator, // '|' + Comma, // ',' + End +} + +pub struct Lexer<'a, R: Read> { + pub(crate) atom_tbl: TabledData, + pub(crate) reader: &'a mut ParsingStream, + pub(crate) flags: MachineFlags, + pub(crate) line_num: usize, + pub(crate) col_num: usize +} + +impl<'a, R: Read + fmt::Debug> fmt::Debug for Lexer<'a, R> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Lexer") + .field("atom_tbl", &self.atom_tbl) + .field("reader", &"&'a mut ParsingStream") // Hacky solution. + .field("line_num", &self.line_num) + .field("col_num", &self.col_num) + .finish() + } +} + +impl<'a, R: Read> Lexer<'a, R> { + pub fn new( + atom_tbl: TabledData, + flags: MachineFlags, + src: &'a mut ParsingStream, + ) -> Self { + Lexer { atom_tbl, flags, reader: src, line_num: 0, col_num: 0 } + } + + fn return_char(&mut self, c: char) { + if new_line_char!(c) { + self.line_num -= 1; + self.col_num = 0; + } + + self.reader.put_back(Ok(c)); + } + + fn skip_char(&mut self) -> Result { + if let Some(Ok(c)) = self.reader.next() { + self.col_num += 1; + + if new_line_char!(c) { + self.line_num += 1; + self.col_num = 0; + } + + Ok(c) + } else { + Err(ParserError::UnexpectedEOF) + } + } + + pub fn eof(&mut self) -> Result { + if self.reader.peek().is_none() { + return Ok(true); + } + + let mut c = is_not_eof!(self.lookahead_char()); + + while layout_char!(c) { + self.skip_char()?; + + if self.reader.peek().is_none() { + return Ok(true); + } + + c = is_not_eof!(self.lookahead_char()); + } + + Ok(false) + } + + pub fn lookahead_char(&mut 
self) -> Result { + match self.reader.peek() { + Some(&Ok(c)) => Ok(c), + _ => Err(ParserError::UnexpectedEOF), + } + } + + fn single_line_comment(&mut self) -> Result<(), ParserError> + { + loop { + if self.reader.peek().is_none() || new_line_char!(self.skip_char()?) { + break; + } + } + + Ok(()) + } + + fn bracketed_comment(&mut self) -> Result { + // we have already checked that the current lookahead_char is comment_1_char, just skip it + let c = self.skip_char()?; + + if comment_2_char!(self.lookahead_char()?) { + self.skip_char()?; + + // Keep reading until we find characters '*' and '/' + // Deliberately skip checks for prolog_char to allow comments to contain any characters, + // including so-called "extended characters", without having to explicitly add them to a character class. + let mut c = self.lookahead_char()?; + loop { + while !comment_2_char!(c) { + self.skip_char()?; + c = self.lookahead_char()?; + } + + self.skip_char()?; + + c = self.lookahead_char()?; + if comment_1_char!(c) { + break; + } + } + + if prolog_char!(c) { + self.skip_char()?; + Ok(true) + } else { + Err(ParserError::NonPrologChar(self.line_num, self.col_num)) + } + } else { + self.return_char(c); + Ok(false) + } + } + + fn get_back_quoted_char(&mut self) -> Result { + if back_quote_char!(self.lookahead_char()?) { + let c = self.skip_char()?; + + if !back_quote_char!(self.lookahead_char()?) { + self.return_char(c); + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } else { + self.skip_char() + } + } else if single_quote_char!(self.lookahead_char()?) { + self.skip_char() + } else { + self.get_non_quote_char() + } + } + + fn get_back_quoted_item(&mut self) -> Result, ParserError> { + if backslash_char!(self.lookahead_char()?) { + let c = self.skip_char()?; + + if new_line_char!(self.lookahead_char()?) 
{ + self.skip_char()?; + Ok(None) + } else { + self.return_char(c); + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } + } else { + self.get_back_quoted_char().map(Some) + } + } + + fn get_back_quoted_string(&mut self) -> Result { + let c = self.lookahead_char()?; + + if back_quote_char!(c) { + self.skip_char()?; + + let mut token = String::new(); + consume_chars_with!(token, self.get_back_quoted_item()); + + if back_quote_char!(self.lookahead_char()?) { + self.skip_char()?; + Ok(token) + } else { + Err(ParserError::MissingQuote(self.line_num, self.col_num)) + } + } else { + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } + } + + fn get_single_quoted_item(&mut self) -> Result, ParserError> + { + if backslash_char!(self.lookahead_char()?) { + let c = self.skip_char()?; + + if new_line_char!(self.lookahead_char()?) { + self.skip_char()?; + return Ok(None); + } else { + self.return_char(c); + } + } + + self.get_single_quoted_char().map(Some) + } + + fn get_single_quoted_char(&mut self) -> Result { + let c = self.lookahead_char()?; + + if single_quote_char!(c) { + self.skip_char()?; + + if !single_quote_char!(self.lookahead_char()?) { + self.return_char(c); + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } else { + self.skip_char() + } + } else if double_quote_char!(c) || back_quote_char!(c) { + self.skip_char() + } else { + self.get_non_quote_char() + } + } + + fn get_double_quoted_item(&mut self) -> Result, ParserError> + { + if backslash_char!(self.lookahead_char()?) { + let c = self.skip_char()?; + + if new_line_char!(self.lookahead_char()?) { + self.skip_char()?; + return Ok(None) + } else { + self.return_char(c); + } + } + + self.get_double_quoted_char().map(Some) + } + + fn get_double_quoted_char(&mut self) -> Result { + if double_quote_char!(self.lookahead_char()?) { + let c = self.skip_char()?; + + if !double_quote_char!(self.lookahead_char()?) 
{ + self.return_char(c); + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } else { + self.skip_char() + } + } else if single_quote_char!(self.lookahead_char()?) { + self.skip_char() + } else if back_quote_char!(self.lookahead_char()?) { + self.skip_char() + } else { + self.get_non_quote_char() + } + } + + fn get_control_escape_sequence(&mut self) -> Result + { + let escaped = match self.lookahead_char()? { + 'a' => '\u{07}', // UTF-8 alert + 'b' => '\u{08}', // UTF-8 backspace + 'v' => '\u{0b}', // UTF-8 vertical tab + 'f' => '\u{0c}', // UTF-8 form feed + 't' => '\t', + 'n' => '\n', + 'r' => '\r', + c => return Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + }; + + self.skip_char()?; + return Ok(escaped); + } + + fn get_octal_escape_sequence(&mut self) -> Result + { + self.escape_sequence_to_char(|c| octal_digit_char!(c), 8) + } + + fn get_hexadecimal_escape_sequence(&mut self) -> Result + { + self.skip_char()?; + let c = self.lookahead_char()?; + + if hexadecimal_digit_char!(c) { + self.escape_sequence_to_char(|c| hexadecimal_digit_char!(c), 16) + } else { + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } + } + + fn escape_sequence_to_char( + &mut self, + accept_char: impl Fn(char) -> bool, + radix: u32, + ) -> Result { + let mut c = self.lookahead_char()?; + let mut token = String::new(); + + loop { + token.push(c); + + self.skip_char()?; + c = self.lookahead_char()?; + + if !accept_char(c) { + break; + } + } + + if backslash_char!(c) { + self.skip_char()?; + u32::from_str_radix(&token, radix) + .map_or_else( + |_| Err(ParserError::ParseBigInt(self.line_num, self.col_num)), + |n| char::try_from(n) + .map_err(|_| ParserError::Utf8Error(self.line_num, self.col_num)) + ) + } else { + // on failure, restore the token characters and backslash. 
+ self.reader.put_back_all(token.chars().map(Ok)); + self.reader.put_back(Ok('\\')); + + Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)) + } + } + + fn get_non_quote_char(&mut self) -> Result { + let c = self.lookahead_char()?; + + if graphic_char!(c) || alpha_numeric_char!(c) || solo_char!(c) || space_char!(c) { + self.skip_char() + } else { + if !backslash_char!(c) { + return Err(ParserError::UnexpectedChar(c, self.line_num, self.col_num)); + } + + self.skip_char()?; + + let c = self.lookahead_char()?; + + if meta_char!(c) { + self.skip_char() + } else if octal_digit_char!(c) { + self.get_octal_escape_sequence() + } else if symbolic_hexadecimal_char!(c) { + self.get_hexadecimal_escape_sequence() + } else { + self.get_control_escape_sequence() + } + } + } + + fn char_code_list_token(&mut self) -> Result { + let mut token = String::new(); + + self.skip_char()?; + consume_chars_with!(token, self.get_double_quoted_item()); + + if double_quote_char!(self.lookahead_char()?) { + self.skip_char()?; + Ok(token) + } else { + Err(ParserError::MissingQuote(self.line_num, self.col_num)) + } + } + + fn hexadecimal_constant(&mut self) -> Result { + self.skip_char()?; + + if hexadecimal_digit_char!(self.lookahead_char()?) { + let mut token = String::new(); + + while hexadecimal_digit_char!(self.lookahead_char()?) { + token.push(self.skip_char()?); + } + + isize::from_str_radix(&token, 16) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + Integer::from_str_radix(&token, 16) + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else { + self.return_char('x'); + Err(ParserError::ParseBigInt(self.line_num, self.col_num)) + } + } + + fn octal_constant(&mut self) -> Result { + self.skip_char()?; + + if octal_digit_char!(self.lookahead_char()?) { + let mut token = String::new(); + + while octal_digit_char!(self.lookahead_char()?) 
{ + token.push(self.skip_char()?); + } + + isize::from_str_radix(&token, 8) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + Integer::from_str_radix(&token, 8) + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else { + self.return_char('o'); + Err(ParserError::ParseBigInt(self.line_num, self.col_num)) + } + } + + fn binary_constant(&mut self) -> Result { + self.skip_char()?; + + if binary_digit_char!(self.lookahead_char()?) { + let mut token = String::new(); + + while binary_digit_char!(self.lookahead_char()?) { + token.push(self.skip_char()?); + } + + isize::from_str_radix(&token, 2) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + Integer::from_str_radix(&token, 2) + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else { + self.return_char('b'); + Err(ParserError::ParseBigInt(self.line_num, self.col_num)) + } + } + + fn variable_token(&mut self) -> Result { + let mut s = String::new(); + s.push(self.skip_char()?); + + while alpha_numeric_char!(self.lookahead_char()?) { + s.push(self.skip_char()?); + } + + Ok(Token::Var(rc_atom!(s))) + } + + fn name_token(&mut self, c: char) -> Result { + let mut token = String::new(); + + if small_letter_char!(c) { + token.push(self.skip_char()?); + + while alpha_numeric_char!(self.lookahead_char()?) { + token.push(self.skip_char()?); + } + } else if graphic_token_char!(c) { + token.push(self.skip_char()?); + + while graphic_token_char!(self.lookahead_char()?) { + token.push(self.skip_char()?); + } + } else if cut_char!(c) { + token.push(self.skip_char()?); + } else if semicolon_char!(c) { + token.push(self.skip_char()?); + } else if single_quote_char!(c) { + self.skip_char()?; + + consume_chars_with!(token, self.get_single_quoted_item()); + + if single_quote_char!(self.lookahead_char()?) 
{ + self.skip_char()?; + + /* exactly one character between the quotes: return it as a Char constant */ if !token.is_empty() && token.chars().skip(1).next().is_none() { + if let Some(c) = token.chars().next() { + return Ok(Token::Constant(Constant::Char(c))); + } + } + } else { + return Err(ParserError::InvalidSingleQuotedCharacter(self.lookahead_char()?)) + } + } else { + /* back-quoted strings are read but never accepted: success is turned into an error as well */ match self.get_back_quoted_string() { + Ok(_) => return Err(ParserError::BackQuotedString(self.line_num, self.col_num)), + Err(e) => return Err(e) + } + } + + if token.as_str() == "[]" { + Ok(Token::Constant(Constant::EmptyList)) + } else { + Ok(Token::Constant(atom!(token, self.atom_tbl))) + } + } + + /* Hands the last character of `token` back to the reader and parses what remains as a float. Panics through unwrap if `token` is empty. */ fn vacate_with_float(&mut self, mut token: String) -> Token { + self.return_char(token.pop().unwrap()); + + let result = OrderedFloat(parse_lossy::(token.as_bytes())); + Token::Constant(Constant::Float(result)) + } + + /* Lexes a number whose first digit is the current character. Plain decimal digits give a Fixnum, or an Integer when they do not fit in isize. A `.` followed by a digit continues as a float with an optional exponent; a `.` followed by anything else (or by end of input) is handed back to the reader. A lone `0` followed by `x`, `o`, `b` or a single quote is tried as a radix constant or character code, falling back to the integer 0 when that fails with ParseBigInt. */ pub fn number_token(&mut self) -> Result { + let mut token = String::new(); + + token.push(self.skip_char()?); + let mut c = self.lookahead_char()?; + + while decimal_digit_char!(c) { + token.push(c); + self.skip_char()?; + c = self.lookahead_char()?; + } + + if decimal_point_char!(c) { + self.skip_char()?; + + /* nothing after the `.`: it ends the clause, so give it back and return the integer */ if self.reader.peek().is_none() { + self.return_char('.'); + + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else if decimal_digit_char!(self.lookahead_char()?) { + token.push('.'); + token.push(self.skip_char()?); + + let mut c = self.lookahead_char()?; + + while decimal_digit_char!(c) { + token.push(c); + self.skip_char()?; + c = self.lookahead_char()?; + } + + if exponent_char!(self.lookahead_char()?) 
{ + token.push(self.skip_char()?); + + let c = match self.lookahead_char() { + Err(_) => return Ok(self.vacate_with_float(token)), + Ok(c) => c + }; + + if !sign_char!(c) && !decimal_digit_char!(c) { + return Ok(self.vacate_with_float(token)); + } + + if sign_char!(c) { + token.push(self.skip_char()?); + + let c = match self.lookahead_char() { + Err(_) => { + self.return_char(token.pop().unwrap()); + return Ok(self.vacate_with_float(token)); + }, + Ok(c) => c + }; + + if !decimal_digit_char!(c) { + self.return_char(token.pop().unwrap()); + return Ok(self.vacate_with_float(token)); + } + } + + if decimal_digit_char!(self.lookahead_char()?) { + token.push(self.skip_char()?); + + while decimal_digit_char!(self.lookahead_char()?) { + token.push(self.skip_char()?); + } + + let n = OrderedFloat(parse_lossy::(token.as_bytes())); + Ok(Token::Constant(Constant::Float(n))) + } else { + return Ok(self.vacate_with_float(token)); + } + } else { + let n = OrderedFloat(parse_lossy::(token.as_bytes())); + Ok(Token::Constant(Constant::Float(n))) + } + } else { + self.return_char('.'); + + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } + } else { + if token.starts_with('0') && token.len() == 1 { + if c == 'x' { + self.hexadecimal_constant() + .or_else(|e| { + if let ParserError::ParseBigInt(..) = e { + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else { + Err(e) + } + }) + } else if c == 'o' { + self.octal_constant() + .or_else(|e| { + if let ParserError::ParseBigInt(..) 
= e { + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else { + Err(e) + } + }) + } else if c == 'b' { + self.binary_constant() + .or_else(|e| { + if let ParserError::ParseBigInt(..) = e { + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } else { + Err(e) + } + }) + } else if single_quote_char!(c) { + self.skip_char()?; + + if backslash_char!(self.lookahead_char()?) { + self.skip_char()?; + + if new_line_char!(self.lookahead_char()?) { + self.return_char('\\'); + self.return_char('\''); + + return Ok(Token::Constant(Constant::Fixnum(0))); + } else { + self.return_char('\\'); + } + } + + self.get_single_quoted_char() + .and_then(|c| { + Ok(Token::Constant(Constant::Fixnum(c as isize))) + }) + .or_else(|_| { + self.return_char(c); + + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + }) + } else { + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } + } else { + isize::from_str_radix(&token, 10) + .map(|n| Token::Constant(Constant::Fixnum(n))) + .or_else(|_| { + token.parse::() + .map(|n| Token::Constant(Constant::Integer(Rc::new(n)))) + .map_err(|_| ParserError::ParseBigInt( + self.line_num, + self.col_num, + )) + }) + } 
+ } + } + + /* Skips layout characters, newlines, `%` line comments and bracketed comments. Returns Ok(true) if at least one of them was skipped. The scan stops at the first character that starts none of these, when bracketed_comment returns false, or when lookahead_char fails; a lookahead error is not propagated from here. */ pub fn scan_for_layout(&mut self) -> Result { + let mut layout_inserted = false; + let mut more_layout = true; + + loop { + let cr = self.lookahead_char(); + + match cr { + Ok(c) if layout_char!(c) || new_line_char!(c) => { + self.skip_char()?; + layout_inserted = true; + }, + Ok(c) if end_line_comment_char!(c) => { + self.single_line_comment()?; + layout_inserted = true; + }, + Ok(c) if comment_1_char!(c) => + if self.bracketed_comment()? { + layout_inserted = true; + } else { + more_layout = false; + }, + _ => more_layout = false + }; + + if !more_layout { + break; + } + } + + Ok(layout_inserted) + } + + /* Returns the next token after skipping layout. A capital letter or `_` starts a variable; `(` yields Token::Open when layout preceded it and Token::OpenCT otherwise; `.` followed by a layout character, `%` or end of input yields Token::End; a decimal digit starts a number; a double quote yields an atom or a string depending on the double_quotes flag. Anything not matched by these or the other punctuation cases is delegated to name_token. */ pub fn next_token(&mut self) -> Result { + let layout_inserted = self.scan_for_layout()?; + let cr = self.lookahead_char(); + + match cr { + Ok(c) => { + if capital_letter_char!(c) || variable_indicator_char!(c) { + return self.variable_token(); + } + + if c == ',' { + self.skip_char()?; + return Ok(Token::Comma); + } + + if c == ')' { + self.skip_char()?; + return Ok(Token::Close); + } + + if c == '(' { + self.skip_char()?; + return Ok(if layout_inserted { Token::Open } + else { Token::OpenCT }); + } + + if c == '.' 
{ + self.skip_char()?; + + match self.lookahead_char() { + Ok(c) if layout_char!(c) || c == '%' => { + if new_line_char!(c) { + self.skip_char()?; + } + + return Ok(Token::End); + }, + Err(ParserError::UnexpectedEOF) => { + return Ok(Token::End); + } + _ => { + self.return_char('.'); + } + }; + } + + if decimal_digit_char!(c) { + return self.number_token(); + } + + if c == ']' { + self.skip_char()?; + return Ok(Token::CloseList); + } + + if c == '[' { + self.skip_char()?; + return Ok(Token::OpenList); + } + + if c == '|' { + self.skip_char()?; + return Ok(Token::HeadTailSeparator); + } + + if c == '{' { + self.skip_char()?; + return Ok(Token::OpenCurly); + } + + if c == '}' { + self.skip_char()?; + return Ok(Token::CloseCurly); + } + + if c == '"' { + let s = self.char_code_list_token()?; + + if let DoubleQuotes::Atom = self.flags.double_quotes { + let s = clause_name!(s, self.atom_tbl); + return Ok(Token::Constant(Constant::Atom(s, None))); + } else { + let s = Rc::new(s); + return Ok(Token::Constant(Constant::String(s))); + } + } + + self.name_token(c) + }, + Err(e) => Err(e) + } + } +} diff --git a/crates/prolog_parser/src/lib.rs b/crates/prolog_parser/src/lib.rs new file mode 100644 index 00000000..f8ef7536 --- /dev/null +++ b/crates/prolog_parser/src/lib.rs @@ -0,0 +1,15 @@ +extern crate lexical; +extern crate ordered_float; +#[cfg(feature = "rug")] +extern crate rug; +#[cfg(feature = "num-rug-adapter")] +extern crate num_rug_adapter as rug; +extern crate unicode_reader; + +#[macro_use] pub mod tabled_rc; +#[macro_use] pub mod ast; +#[macro_use] pub mod macros; +pub mod parser; +pub mod put_back_n; + +pub mod lexer; diff --git a/crates/prolog_parser/src/macros.rs b/crates/prolog_parser/src/macros.rs new file mode 100644 index 00000000..e4520aba --- /dev/null +++ b/crates/prolog_parser/src/macros.rs @@ -0,0 +1,187 @@ +#[macro_export] +macro_rules! 
char_class { /* True when $c equals any of the listed characters; expands recursively into a chain of `==` comparisons joined by `||`, so $c appears once per listed character. */ + ($c: expr, [$head:expr]) => ($c == $head); + ($c: expr, [$head:expr $(, $cs:expr)+]) => ($c == $head || char_class!($c, [$($cs),*])); +} + +/* Characters tested after a backslash for the symbolic control escapes. */ #[macro_export] +macro_rules! symbolic_control_char { + ($c: expr) => (char_class!($c, ['a', 'b', 'f', 'n', 'r', 't', 'v', '0'])) +} + +#[macro_export] +macro_rules! space_char { + ($c: expr) => ($c == ' ') +} + +/* Space, newline, tab, U+000B and U+000C. Carriage return is not in this list. */ #[macro_export] +macro_rules! layout_char { + ($c: expr) => (char_class!($c, [' ', '\n', '\t', '\u{0B}', '\u{0C}'])) +} + +#[macro_export] +macro_rules! symbolic_hexadecimal_char { + ($c: expr) => ($c == 'x') +} + +/* NOTE(review): the range macros below mention $c twice, so an argument with a call in it, such as `self.lookahead_char()?` in the lexer, is expanded (and evaluated) twice per test. */ #[macro_export] +macro_rules! octal_digit_char { + ($c: expr) => ($c >= '0' && $c <= '7') +} + +#[macro_export] +macro_rules! binary_digit_char { + ($c: expr) => ($c >= '0' && $c <= '1') +} + +/* Relies on `&&` binding tighter than `||`: three independent range tests. */ #[macro_export] +macro_rules! hexadecimal_digit_char { + ($c: expr) => ($c >= '0' && $c <= '9' || + $c >= 'A' && $c <= 'F' || + $c >= 'a' && $c <= 'f') +} + +#[macro_export] +macro_rules! exponent_char { + ($c: expr) => ($c == 'e' || $c == 'E') +} + +#[macro_export] +macro_rules! sign_char { + ($c: expr) => ($c == '-' || $c == '+') +} + +#[macro_export] +macro_rules! new_line_char { + ($c: expr) => ($c == '\n') +} + +#[macro_export] +macro_rules! end_line_comment_char { + ($c: expr) => ($c == '%') +} + +/* comment_1_char and comment_2_char are the `/` and `*` tested for bracketed comments. */ #[macro_export] +macro_rules! comment_1_char { + ($c: expr) => ($c == '/') +} + +#[macro_export] +macro_rules! comment_2_char { + ($c: expr) => ($c == '*') +} + +/* ASCII `A` to `Z` only. */ #[macro_export] +macro_rules! capital_letter_char { + ($c: expr) => ($c >= 'A' && $c <= 'Z') +} + +/* ASCII `a` to `z` only. */ #[macro_export] +macro_rules! small_letter_char { + ($c: expr) => ($c >= 'a' && $c <= 'z') +} + +#[macro_export] +macro_rules! variable_indicator_char { + ($c: expr) => ($c == '_') +} + +/* Symbol characters; backslash is not included here and is added by graphic_token_char. */ #[macro_export] +macro_rules! graphic_char { + ($c: expr) => (char_class!($c, ['#', '$', '&', '*', '+', '-', '.', '/', ':', + '<', '=', '>', '?', '@', '^', '~'])) +} + +#[macro_export] +macro_rules! 
graphic_token_char { /* graphic_char plus backslash */ + ($c: expr) => (graphic_char!($c) || backslash_char!($c)) +} + +/* ASCII letters, underscore, and the listed ranges from U+00A0 up to U+074F; U+00D7 and U+00F7 fall in the gaps between the listed ranges and are therefore excluded. Unlike the comparison macros, this one mentions $c once. */ #[macro_export] +macro_rules! alpha_char { + ($c: expr) => + (match $c { + 'a' ..= 'z' => true, + 'A' ..= 'Z' => true, + '_' => true, + '\u{00A0}' ..= '\u{00BF}' => true, + '\u{00C0}' ..= '\u{00D6}' => true, + '\u{00D8}' ..= '\u{00F6}' => true, + '\u{00F8}' ..= '\u{00FF}' => true, + '\u{0100}' ..= '\u{017F}' => true, // Latin Extended-A + '\u{0180}' ..= '\u{024F}' => true, // Latin Extended-B + '\u{0250}' ..= '\u{02AF}' => true, // IPA Extensions + '\u{02B0}' ..= '\u{02FF}' => true, // Spacing Modifier Letters + '\u{0300}' ..= '\u{036F}' => true, // Combining Diacritical Marks + '\u{0370}' ..= '\u{03FF}' => true, // Greek/Coptic + '\u{0400}' ..= '\u{04FF}' => true, // Cyrillic + '\u{0500}' ..= '\u{052F}' => true, // Cyrillic Supplement + '\u{0530}' ..= '\u{058F}' => true, // Armenian + '\u{0590}' ..= '\u{05FF}' => true, // Hebrew + '\u{0600}' ..= '\u{06FF}' => true, // Arabic + '\u{0700}' ..= '\u{074F}' => true, // Syriac + _ => false + }) +} + +/* ASCII `0` to `9` only. */ #[macro_export] +macro_rules! decimal_digit_char { + ($c: expr) => ($c >= '0' && $c <= '9') +} + +#[macro_export] +macro_rules! decimal_point_char { + ($c: expr) => ($c == '.') +} + +#[macro_export] +macro_rules! alpha_numeric_char { + ($c: expr) => (alpha_char!($c) || decimal_digit_char!($c)) +} + +#[macro_export] +macro_rules! cut_char { + ($c: expr) => ($c == '!') +} + +#[macro_export] +macro_rules! semicolon_char { + ($c: expr) => ($c == ';') +} + +#[macro_export] +macro_rules! backslash_char { + ($c: expr) => ($c == '\\') +} + +#[macro_export] +macro_rules! single_quote_char { + ($c: expr) => ($c == '\'') +} + +#[macro_export] +macro_rules! double_quote_char { + ($c: expr) => ($c == '"') +} + +#[macro_export] +macro_rules! back_quote_char { + ($c: expr) => ($c == '`') +} + +/* Backslash and the three quote characters: the characters that may follow a backslash and stand for themselves (see get_non_quote_char in the lexer). */ #[macro_export] +macro_rules! meta_char { + ($c: expr) => ( char_class!($c, ['\\', '\'', '"', '`']) ) +} + +#[macro_export] +macro_rules! 
solo_char { /* punctuation characters, plus `%` */ + ($c: expr) => ( char_class!($c, ['!', '(', ')', ',', ';', '[', ']', + '{', '}', '|', '%']) ) +} + +/* Union of the graphic, alphanumeric, solo, layout and meta classes. */ #[macro_export] +macro_rules! prolog_char { + ($c: expr) => (graphic_char!($c) || alpha_numeric_char!($c) || solo_char!($c) || + layout_char!($c) || meta_char!($c)) +} diff --git a/crates/prolog_parser/src/parser.rs b/crates/prolog_parser/src/parser.rs new file mode 100644 index 00000000..19fd7912 --- /dev/null +++ b/crates/prolog_parser/src/parser.rs @@ -0,0 +1,998 @@ +use ast::*; +use lexer::*; +use tabled_rc::*; + +use ordered_float::OrderedFloat; + +use rug::ops::NegAssign; + +use std::cell::Cell; +use std::io::Read; +use std::mem::swap; +use std::rc::Rc; + +/* Kind of an entry on the parser's stack: either a term or one of the punctuation tokens. */ #[derive(Debug, Clone, Copy, PartialEq)] +enum TokenType { + Term, + Open, + OpenCT, + OpenList, // '[' + OpenCurly, // '{' + HeadTailSeparator, // '|' + Comma, // ',' + Close, + CloseList, // ']' + CloseCurly, // '}' + End +} + +impl TokenType { + /* True for every bracket kind, the comma and the head-tail separator; false for Term and End. */ fn is_sep(self) -> bool { + match self { + TokenType::HeadTailSeparator | TokenType::OpenCT | TokenType::Open | + TokenType::Close | TokenType::OpenList | TokenType::CloseList | + TokenType::OpenCurly | TokenType::CloseCurly | TokenType::Comma + => true, + _ => false + } + } +} + +/* Stack entry: the token kind together with the priority and specifier bits it was pushed with. */ #[derive(Debug, Clone, Copy)] +struct TokenDesc { + tt: TokenType, + priority: usize, + spec: u32 +} + +/* Looks up the operator entry for `name` applied to `arity` arguments: arity 1 tries the prefix entry and then the postfix entry, arity 2 tries the infix entry. Returns None for any other arity or when no entry exists. */ pub +fn get_clause_spec(name: ClauseName, arity: usize, op_dir: &CompositeOpDir) -> Option +{ + match arity { + 1 => { + /* This is a clause with an operator principal functor. Prefix operators + are preferred over postfix ones. 
+ */ + if let Some(OpDirValue(cell)) = op_dir.get(name.clone(), Fixity::Pre) { + return Some(cell.clone()); + } + + if let Some(OpDirValue(cell)) = op_dir.get(name, Fixity::Post) { + return Some(cell.clone()); + } + }, + 2 => + if let Some(OpDirValue(cell)) = op_dir.get(name, Fixity::In) { + return Some(cell.clone()); + }, + _ => {} + }; + + None +} + +/* Collects the prefix, infix and postfix priorities of `name` into one OpDesc, OR-ing in the specifier bits of every fixity whose priority is greater than 0. A prefix entry for `-` with priority 0 still sets NEGATIVE_SIGN. Returns None when all three priorities are 0 and NEGATIVE_SIGN is not set. */ pub fn get_op_desc(name: ClauseName, op_dir: &CompositeOpDir) -> Option +{ + let mut op_desc = OpDesc { pre: 0, inf: 0, post: 0, spec: 0 }; + + if let Some(OpDirValue(cell)) = op_dir.get(name.clone(), Fixity::Pre) { + let (pri, spec) = cell.get(); + + if pri > 0 { + op_desc.pre = pri; + op_desc.spec |= spec; + } else if name.as_str() == "-" { + op_desc.spec |= NEGATIVE_SIGN; + } + } + + if let Some(OpDirValue(cell)) = op_dir.get(name.clone(), Fixity::Post) { + let (pri, spec) = cell.get(); + + if pri > 0 { + op_desc.post = pri; + op_desc.spec |= spec; + } + } + + if let Some(OpDirValue(cell)) = op_dir.get(name.clone(), Fixity::In) { + let (pri, spec) = cell.get(); + + if pri > 0 { + op_desc.inf = pri; + op_desc.spec |= spec; + } + } + + if op_desc.pre + op_desc.post + op_desc.inf == 0 && !is_negate!(op_desc.spec) { + None + } else { + Some(op_desc) + } +} + +/* In the three binary checks d2 is the operator, d3 the entry below it (left operand) and d1 the entry above it (right operand); reduce_op pops them in the order d1, d2, d3. xfx: operator priority at most `priority`, both operands terms of strictly lower priority. */ fn affirm_xfx(priority: usize, d2: TokenDesc, d3: TokenDesc, d1: TokenDesc) -> bool +{ + d2.priority <= priority + && is_term!(d3.spec) + && is_term!(d1.spec) + && d3.priority < d2.priority + && d1.priority < d2.priority +} + +/* yfx: the left operand may have the same priority as the operator when its spec is an lterm. */ fn affirm_yfx(priority: usize, d2: TokenDesc, d3: TokenDesc, d1: TokenDesc) -> bool +{ + d2.priority <= priority + && ((is_term!(d3.spec) && d3.priority < d2.priority) + || (is_lterm!(d3.spec) && d3.priority == d2.priority)) + && is_term!(d1.spec) + && d1.priority < d2.priority +} + + +/* xfy: the right operand may have the same priority as the operator. Here the operator priority must be strictly below `priority`, unlike the `<=` used for xfx and yfx. */ fn affirm_xfy(priority: usize, d2: TokenDesc, d3: TokenDesc, d1: TokenDesc) -> bool +{ + d2.priority < priority + && is_term!(d3.spec) + && d3.priority < d2.priority + && is_term!(d1.spec) + && d1.priority <= d2.priority +} + +/* yf: d1 is the postfix operator and d2 its operand, which may have equal priority when it is an lterm. */ fn affirm_yf(d1: TokenDesc, d2: 
TokenDesc) -> bool +{ + let is_valid_lterm = is_lterm!(d2.spec) && d2.priority == d1.priority; + (is_term!(d2.spec) && d2.priority < d1.priority) || is_valid_lterm +} + +/* xf: d1 is the postfix operator, d2 its operand, which must be a term of strictly lower priority. */ fn affirm_xf(d1: TokenDesc, d2: TokenDesc) -> bool +{ + is_term!(d2.spec) && d2.priority < d1.priority +} + +/* fy: d2 is the prefix operator and d1 its operand (reduce_op passes desc1, desc2 in that order); the operand may have equal priority. */ fn affirm_fy(priority: usize, d1: TokenDesc, d2: TokenDesc) -> bool +{ + d2.priority < priority && is_term!(d1.spec) && d1.priority <= d2.priority +} + +/* fx: as affirm_fy, but the operand priority must be strictly lower and the operator priority may equal `priority`. */ fn affirm_fx(priority: usize, d1: TokenDesc, d2: TokenDesc) -> bool +{ + d2.priority <= priority && is_term!(d1.spec) && d1.priority < d2.priority +} + +/* Atom name for a punctuation kind; Open and OpenCT both map to `(`. Term is the only kind that yields None. */ fn sep_to_atom(tt: TokenType) -> Option +{ + match tt { + TokenType::Open | TokenType::OpenCT => + Some(clause_name!("(")), + TokenType::Close => + Some(clause_name!(")")), + TokenType::OpenList => + Some(clause_name!("[")), + TokenType::CloseList => + Some(clause_name!("]")), + TokenType::OpenCurly => + Some(clause_name!("{")), + TokenType::CloseCurly => + Some(clause_name!("}")), + TokenType::HeadTailSeparator => + Some(clause_name!("|")), + TokenType::Comma => + Some(clause_name!(",")), + TokenType::End => + Some(clause_name!(".")), + _ => None + } +} + +/* Prefix, infix and postfix priorities of one operator name (0 where get_op_desc found none) plus the combined specifier bits. */ #[derive(Debug, Clone, Copy)] +pub struct OpDesc { + pub pre: usize, + pub inf: usize, + pub post: usize, + pub spec: Specifier +} + +/* Parser state: `tokens` holds the tokens of the term being read in reverse order (read_term pops from the end), `stack` holds one TokenDesc per shifted token or reduced term, `terms` holds the terms built so far. */ #[derive(Debug)] +pub struct Parser<'a, R: Read> { + lexer: Lexer<'a, R>, + tokens: Vec, + stack: Vec, + terms: Vec, +} + +/* Lexes tokens up to and including the next Token::End and returns them reversed, so that Vec::pop yields them in source order. The first lexer error is returned as-is. */ fn read_tokens<'a, R: Read>(lexer: &mut Lexer<'a, R>) -> Result, ParserError> +{ + let mut tokens = vec![]; + + loop { + let token = lexer.next_token()?; + let at_end = Token::End == token; + + tokens.push(token); + + if at_end { + break; + } + } + + tokens.reverse(); + + Ok(tokens) +} + +impl<'a, R: Read> Parser<'a, R> { + /* Creates a parser with empty token, stack and term buffers around a new Lexer for `stream`. */ pub fn new( + stream: &'a mut ParsingStream, + atom_tbl: TabledData, + flags: MachineFlags, + ) -> Self { + Parser { lexer: Lexer::new(atom_tbl, flags, stream), + tokens: vec![], + stack: Vec::new(), + terms: Vec::new() } + } + + /* Current line number as tracked by the lexer. */ #[inline] + pub fn line_num(&self) -> 
usize { + self.lexer.line_num + } + + /* Current column number as tracked by the lexer. */ #[inline] + pub fn col_num(&self) -> usize { + self.lexer.col_num + } + + /* Returns a clone of the lexer's atom table handle. */ #[inline] + pub fn get_atom_tbl(&self) -> TabledData { + self.lexer.atom_tbl.clone() + } + + /* Replaces the lexer's atom table handle. */ #[inline] + pub fn set_atom_tbl(&mut self, atom_tbl: TabledData) { + self.lexer.atom_tbl = atom_tbl; + } + + /* Name and operator descriptor of the operator described by `td`. `|` uses td's own priority and spec; `,` is always (1000, XFY). For a Term entry the top of `terms` is popped and returned only if it is an atom; a non-atom term is pushed back and None is returned. Every other kind gives None. */ fn get_term_name(&mut self, td: TokenDesc) -> Option<(ClauseName, Option)> { + match td.tt { + TokenType::HeadTailSeparator => { + Some((clause_name!("|"), Some(SharedOpDesc::new(td.priority, td.spec)))) + } + TokenType::Comma => { + Some((clause_name!(","), Some(SharedOpDesc::new(1000, XFY)))) + } + TokenType::Term => { + match self.terms.pop() { + Some(Term::Constant(_, Constant::Atom(atom, spec))) => + Some((atom, spec)), + Some(term) => { + self.terms.push(term); + None + }, + _ => None + } + } + _ => { + None + } + } + } + + /* Builds the clause name(arg1, arg2) from the term stack (arg2 is on top, then the operator, then arg1) and pushes one Term stack entry with td's priority and the given spec. NOTE(review): if the operator name or arg1 cannot be obtained, the terms already popped are dropped and nothing is pushed. */ fn push_binary_op(&mut self, td: TokenDesc, spec: Specifier) + { + if let Some(arg2) = self.terms.pop() { + if let Some((name, shared_op_desc)) = self.get_term_name(td) { + if let Some(arg1) = self.terms.pop() { + let term = Term::Clause(Cell::default(), + name, + vec![Box::new(arg1), Box::new(arg2)], + shared_op_desc); + + self.terms.push(term); + self.stack.push(TokenDesc { tt: TokenType::Term, + priority: td.priority, + spec }); + } + } + } + } + + /* Builds the clause name(arg1) from the two topmost terms. For a prefix operator the argument is on top and the name below it; for a postfix `assoc` the two are swapped first. Nothing is pushed when the name term is not an atom. */ fn push_unary_op(&mut self, td: TokenDesc, spec: Specifier, assoc: u32) + { + if let Some(mut arg1) = self.terms.pop() { + if let Some(mut name) = self.terms.pop() { + if is_postfix!(assoc) { + swap(&mut arg1, &mut name); + } + + if let Term::Constant(_, Constant::Atom(name, shared_op_desc)) = name { + let term = Term::Clause(Cell::default(), name, vec![Box::new(arg1)], + shared_op_desc); + + self.terms.push(term); + self.stack.push(TokenDesc { tt: TokenType::Term, + priority: td.priority, + spec }); + } + } + } + } + + /* Pushes `atom` as an atom constant carrying the operator descriptor taken from `op_dir_val` (if any), and a Term stack entry whose spec is `assoc`. */ fn promote_atom_op(&mut self, atom: ClauseName, priority: usize, assoc: u32, + op_dir_val: Option<&OpDirValue>) + { + let spec = op_dir_val.map(|op_dir_val| 
op_dir_val.shared_op_desc()); + + self.terms.push(Term::Constant(Cell::default(), Constant::Atom(atom, spec))); + self.stack.push(TokenDesc { tt: TokenType::Term, priority, spec: assoc }); + } + + /* Pushes a stack entry for `token` with the given priority and spec; constants and variables additionally push a term. A String constant is expanded into a list of character codes when the double_quotes flag reports is_codes(). A variable whose trimmed name is `_` becomes Term::AnonVar. */ fn shift(&mut self, token: Token, priority: usize, spec: Specifier) + { + let tt = match token { + Token::Constant(Constant::String(s)) + if self.lexer.flags.double_quotes.is_codes() => { + let mut list = Term::Constant(Cell::default(), Constant::EmptyList); + + /* build the list back to front so the first character ends up as the head */ for c in s.chars().rev() { + list = Term::Cons( + Cell::default(), + Box::new(Term::Constant( + Cell::default(), + Constant::Fixnum(c as isize), + )), + Box::new(list), + ); + } + + self.terms.push(list); + TokenType::Term + } + Token::Constant(c) => { + self.terms.push(Term::Constant(Cell::default(), c)); + TokenType::Term + }, + Token::Var(v) => { + if v.trim() == "_" { + self.terms.push(Term::AnonVar); + } else { + self.terms.push(Term::Var(Cell::default(), v)); + } + + TokenType::Term + }, + Token::Comma => TokenType::Comma, + Token::Open => TokenType::Open, + Token::Close => TokenType::Close, + Token::OpenCT => TokenType::OpenCT, + Token::HeadTailSeparator => TokenType::HeadTailSeparator, + Token::OpenList => TokenType::OpenList, + Token::CloseList => TokenType::CloseList, + Token::OpenCurly => TokenType::OpenCurly, + Token::CloseCurly => TokenType::CloseCurly, + Token::End => TokenType::End, + }; + + self.stack.push(TokenDesc { tt, priority, spec }); + } + + /* Repeatedly reduces operator applications on top of the stack while they are admissible for the given context priority. Each round pops up to three entries: binary forms (xfx, yfx, xfy) are tried on all three, then postfix (yf, xf) and prefix (fy, fx) forms on the top two. Entries that cannot be reduced are pushed back in their original order and the loop ends. */ fn reduce_op(&mut self, priority: usize) { + loop { + if let Some(desc1) = self.stack.pop() { + if let Some(desc2) = self.stack.pop() { + if let Some(desc3) = self.stack.pop() { + if is_xfx!(desc2.spec) && affirm_xfx(priority, desc2, desc3, desc1) + { + self.push_binary_op(desc2, LTERM); + continue; + } + else if is_yfx!(desc2.spec) && affirm_yfx(priority, desc2, desc3, desc1) + { + self.push_binary_op(desc2, LTERM); + continue; + } + else if is_xfy!(desc2.spec) && affirm_xfy(priority, desc2, desc3, desc1) + { + self.push_binary_op(desc2, TERM); + 
continue; + } else { + /* no binary reduction applies: put the third entry back and try the unary forms */ self.stack.push(desc3); + } + } + + if is_yf!(desc1.spec) && affirm_yf(desc1, desc2) { + self.push_unary_op(desc1, LTERM, YF); + continue; + } else if is_xf!(desc1.spec) && affirm_xf(desc1, desc2) { + self.push_unary_op(desc1, LTERM, XF); + continue; + } else if is_fy!(desc2.spec) && affirm_fy(priority, desc1, desc2) { + self.push_unary_op(desc2, TERM, FY); + continue; + } else if is_fx!(desc2.spec) && affirm_fx(priority, desc1, desc2) { + self.push_unary_op(desc2, TERM, FX); + continue; + } else { + self.stack.push(desc2); + self.stack.push(desc1); + } + } else { + self.stack.push(desc1); + } + } + + break; + } + } + + /* Walks the stack from the top expecting the alternation term, comma, term, ... and returns the number of terms seen once an OpenCT is found at a separator position. Returns None for any other shape, including when no OpenCT is reached. */ fn compute_arity_in_brackets(&self) -> Option + { + let mut arity = 0; + + for (i, desc) in self.stack.iter().rev().enumerate() { + if i % 2 == 0 { // expect a term or non-comma operator. + if let TokenType::Comma = desc.tt { + return None; + } else if is_term!(desc.spec) || is_op!(desc.spec) || is_negate!(desc.spec) { + arity += 1; + } else { + return None; + } + } else { + if desc.tt == TokenType::OpenCT { + return Some(arity); + } + + if let TokenType::Comma = desc.tt { + continue; + } else { + return None; + } + } + } + + None + } + + /* Tries to reduce a compound term `name(arg, ...)` when a `)` is seen. After reducing operators at priority 999 it counts the arguments back to the nearest OpenCT, requires the stack entry just below that OpenCT to be a Term whose term atomizes to a name, and replaces name and arguments by a single Clause, or by a Cons cell when the name is `.` with two arguments. Returns false when the stack does not have that shape. */ fn reduce_term(&mut self, op_dir: &CompositeOpDir) -> bool + { + if self.stack.is_empty() { + return false; + } + + self.reduce_op(999); + + let arity = match self.compute_arity_in_brackets() { + Some(arity) => arity, + None => return false + }; + + if self.stack.len() > 2 * arity { + /* index of the stack entry that would be the functor name */ let idx = self.stack.len() - 2 * arity - 1; + + if is_infix!(self.stack[idx].spec) && idx > 0 { + if !is_op!(self.stack[idx - 1].spec) && !self.stack[idx - 1].tt.is_sep() { + return false; + } + } + + if arity >= 2 && is_prefix!(self.stack[idx].spec) && self.stack[idx].priority > 0 { + return false; + } + } else { + return false; + } + + let stack_len = self.stack.len() - 2 * arity - 1; + /* NOTE(review): this subtraction and the `idx - 1` index below assume `terms` holds at least arity + 1 entries - confirm that the stack shape checks above guarantee it */ let idx = self.terms.len() - arity; + + if TokenType::Term == self.stack[stack_len].tt { + if 
self.atomize_term(&self.terms[idx - 1]).is_some() { + self.stack.truncate(stack_len + 1); + + let mut subterms: Vec<_> = self.terms.drain(idx ..) + .map(|t| Box::new(t)) + .collect(); + + if let Some(name) = self.terms.pop().and_then(|t| self.atomize_term(&t)) { + // reduce the '.' functor to a cons cell if it applies. + if name.as_str() == "." && subterms.len() == 2 { + let tail = subterms.pop().unwrap(); + let head = subterms.pop().unwrap(); + + self.terms.push(Term::Cons(Cell::default(), head, tail)); + } else { + let spec = get_clause_spec(name.clone(), subterms.len(), op_dir); + self.terms.push(Term::Clause(Cell::default(), name, subterms, spec)); + } + + /* the functor's stack entry now stands for the whole compound term */ if let Some(&mut TokenDesc { ref mut priority, ref mut spec, + ref mut tt }) = self.stack.last_mut() + { + *tt = TokenType::Term; + *priority = 0; + *spec = TERM; + } + + return true; + } + } + } + + false + } + + /* Skips layout and comments in the input through the lexer; the lexer's boolean result is discarded. */ pub fn devour_whitespace(&mut self) -> Result<(), ParserError> { + self.lexer.scan_for_layout()?; + Ok(()) + } + + /* Clears the stack only; `terms` and `tokens` are left untouched. */ pub fn reset(&mut self) { + self.stack.clear() + } + + /* Pops the top term and, if the stack entry at index - 1 has a priority greater than 0 but lower than the entry at `index`, and the term is a `,` clause of arity 2, unfolds it into its comma-separated parts and pushes those instead. Returns the number of additional terms this produced (parts minus one), or 0 when nothing was expanded. NOTE(review): `index - 1` assumes index is at least 1. */ fn expand_comma_compacted_terms(&mut self, index: usize) -> usize + { + if let Some(term) = self.terms.pop() { + let op_desc = self.stack[index - 1]; + + if 0 < op_desc.priority && op_desc.priority < self.stack[index].priority { + /* '|' is a head-tail separator here, not + * an operator, so expand the + * terms it compacted out again. */ + match (term.name(), term.arity()) { + (Some(name), 2) if name.as_str() == "," => { + let terms = unfold_by_str(term, ","); + let arity = terms.len() - 1; + + self.terms.extend(terms.into_iter()); + return arity; + } + _ => { + } + } + } + + self.terms.push(term); + } + + 0 + } + + /* List counterpart of compute_arity_in_brackets: scans from the top of the stack back to the nearest OpenList and returns the number of terms seen. A head-tail separator is accepted only when exactly one term has been seen above it. */ fn compute_arity_in_list(&self) -> Option + { + let mut arity = 0; + + for (i, desc) in self.stack.iter().rev().enumerate() { + if i % 2 == 0 { // expect a term or non-comma operator. 
+ if let TokenType::Comma = desc.tt { + return None; + } else if is_term!(desc.spec) || is_op!(desc.spec) { + arity += 1; + } else { + return None; + } + } else { + if desc.tt == TokenType::HeadTailSeparator { + if arity == 1 { + continue; + } + + return None; + } else if desc.tt == TokenType::OpenList { + return Some(arity); + } else if desc.tt != TokenType::Comma { + return None; + } + } + } + + None + } + + /* Reduces a list when `]` is seen. An OpenList on top of the stack becomes the empty list. Otherwise operators are reduced at priority 1000, the elements are counted back to the OpenList, and the element terms are folded into nested Cons cells ending in the empty list, or in the term after `|` when the second entry from the top is a head-tail separator. Ok(false) means the stack did not have list shape; an Err is returned when the tail term is missing. */ fn reduce_list(&mut self) -> Result + { + if self.stack.is_empty() { + return Ok(false); + } + + if let Some(ref mut td) = self.stack.last_mut() { + if td.tt == TokenType::OpenList { + td.spec = TERM; + td.tt = TokenType::Term; + td.priority = 0; + + self.terms.push(Term::Constant(Cell::default(), Constant::EmptyList)); + return Ok(true); + } + } + + self.reduce_op(1000); + + let mut arity = match self.compute_arity_in_list() { + Some(arity) => arity, + None => return Ok(false) + }; + + // we know that self.stack.len() >= 2 by this point. + let idx = self.stack.len() - 2; + /* stack length once the OpenList and everything above it are removed */ let list_len = self.stack.len() - 2 * arity; + + let end_term = if self.stack[idx].tt != TokenType::HeadTailSeparator { + Term::Constant(Cell::default(), Constant::EmptyList) + } else { + let term = + match self.terms.pop() { + Some(term) => term, + _ => return Err(ParserError::IncompleteReduction(self.lexer.line_num, + self.lexer.col_num)) + }; + + /* a `|` shifted with priority above 1000 was treated as an operator, which may have folded the preceding comma-separated elements into one term */ if self.stack[idx].priority > 1000 { + arity += self.expand_comma_compacted_terms(idx); + } + + /* the tail term was counted by compute_arity_in_list but is not a list element */ arity -= 1; + + term + }; + + let idx = self.terms.len() - arity; + + let list = self.terms.drain(idx ..) 
+ .rev() + .fold(end_term, |acc, t| Term::Cons(Cell::default(), + Box::new(t), + Box::new(acc))); + + self.stack.truncate(list_len); + + self.stack.push(TokenDesc { tt: TokenType::Term, priority: 0, spec: TERM }); + self.terms.push(list); + + Ok(true) + } + + /* Reduces a curly-brace term when `}` is seen. An OpenCurly on top of the stack becomes the atom `{}`. Otherwise operators are reduced at priority 1201 and, if the top entry is a Term sitting directly on an OpenCurly, the top term is wrapped in a one-argument clause named `{}`. NOTE(review): on the Ok(false) paths after the pop, the popped top entry is not pushed back. */ fn reduce_curly(&mut self) -> Result { + if self.stack.is_empty() { + return Ok(false); + } + + if let Some(ref mut td) = self.stack.last_mut() { + if td.tt == TokenType::OpenCurly { + td.tt = TokenType::Term; + td.priority = 0; + td.spec = TERM; + + let term = Term::Constant(Cell::default(), + atom!("{}", self.lexer.atom_tbl)); + self.terms.push(term); + return Ok(true); + } + } + + self.reduce_op(1201); + + if self.stack.len() > 1 { + if let Some(td) = self.stack.pop() { + if let Some(ref mut oc) = self.stack.last_mut() { + if td.tt != TokenType::Term { + return Ok(false); + } + + if oc.tt == TokenType::OpenCurly { + oc.tt = TokenType::Term; + oc.priority = 0; + oc.spec = TERM; + + let term = match self.terms.pop() { + Some(term) => term, + _ => return Err(ParserError::IncompleteReduction( + self.lexer.line_num, + self.lexer.col_num, + )) + }; + + self.terms.push(Term::Clause( + Cell::default(), + clause_name!("{}"), + vec![Box::new(term)], + None + )); + + return Ok(true); + } + } + } + } + + Ok(false) + } + + /* Reduces a parenthesised expression when `)` is seen and reduce_term did not apply. Operators are reduced at priority 1400, then the second entry from the top is removed; if it was Open or OpenCT, the entry that was above it is turned into a priority-0 Term (a bare punctuation entry other than a comma is first converted to its atom). NOTE(review): the entry is removed before its kind is checked, so the stack is modified even when false is returned. */ fn reduce_brackets(&mut self) -> bool { + if self.stack.is_empty() { + return false; + } + + self.reduce_op(1400); + + if self.stack.len() == 1 { + return false; + } + + let idx = self.stack.len() - 2; + + match self.stack.remove(idx) { + td => + match td.tt { + TokenType::Open | TokenType::OpenCT => { + /* after the removal, index idx holds the former top entry */ if self.stack[idx].tt == TokenType::Comma { + return false; + } + + if let Some(atom) = sep_to_atom(self.stack[idx].tt) { + self.terms.push(Term::Constant(Cell::default(), Constant::Atom(atom, None))); + } + + self.stack[idx].spec = TERM; + self.stack[idx].tt = TokenType::Term; + self.stack[idx].priority = 0; + true + }, + _ => false + } + } + } + + /* Shifts `name` as an operator if get_op_desc finds an entry for it, and returns Ok(false) otherwise. When the name has both a prefix and an infix/postfix reading (or is the negative sign), the reading is chosen from the next pending token and from whether the top of the stack is a term. Returns ParserError::UnexpectedEOF if that choice is needed and no token is pending. */ fn shift_op(&mut self, name: ClauseName, op_dir: 
&CompositeOpDir) -> Result { + if let Some(OpDesc { pre, inf, post, spec }) = get_op_desc(name.clone(), op_dir) { + if (pre > 0 && inf + post > 0) || is_negate!(spec) { + match self.tokens.last().ok_or(ParserError::UnexpectedEOF)? { + // do this when layout hasn't been inserted, + // ie. why we don't match on Token::Open. + &Token::OpenCT => { + // can't be prefix, so either inf == 0 + // or post == 0. + self.reduce_op(inf + post); + + let fixity = if inf > 0 { Fixity::In } else { Fixity::Post }; + let op_dir_val = op_dir.get(name.clone(), fixity); + + self.promote_atom_op( + name, + inf + post, + spec & (XFX | XFY | YFX | YF | XF), + op_dir_val, + ); + }, + _ => { + self.reduce_op(inf + post); + + if let Some(TokenDesc { spec: pspec, .. }) = self.stack.last().cloned() { + // rterm.c: 412 + if is_term!(pspec) { + let fixity = if inf > 0 { Fixity::In } else { Fixity::Post }; + let op_dir_val = op_dir.get(name.clone(), fixity); + + self.promote_atom_op( + name, + inf + post, + spec & (XFX | XFY | YFX | XF | YF), + op_dir_val, + ); + } else { + let op_dir_val = op_dir.get(name.clone(), Fixity::Pre); + self.promote_atom_op(name, pre, spec & (FX | FY | NEGATIVE_SIGN), op_dir_val); + } + } else { + let op_dir_val = op_dir.get(name.clone(), Fixity::Pre); + self.promote_atom_op(name, pre, spec & (FX | FY | NEGATIVE_SIGN), op_dir_val); + } + } + } + } else { + let op_dir_val = op_dir.get( + name.clone(), + if pre + inf == 0 { + Fixity::Post + } else if post + pre == 0 { + Fixity::In + } else { + Fixity::Pre + }, + ); + + self.reduce_op(pre + inf + post); // only one non-zero priority among these. + self.promote_atom_op(name, pre + inf + post, spec, op_dir_val); + } + + Ok(true) + } else { // not an operator. 
+ Ok(false) + } + } + + /* Name of `term` if it is a constant that atomize_constant accepts; None for any non-constant term. */ fn atomize_term(&self, term: &Term) -> Option { + match term { + &Term::Constant(_, ref c) => self.atomize_constant(c), + _ => None + } + } + + /* Name usable as a functor or operator: an atom's own name, a Char as its to_string form, or the empty list through the constant's to_string form. None for every other constant. */ fn atomize_constant(&self, c: &Constant) -> Option { + match c { + &Constant::Atom(ref name, _) => Some(name.clone()), + &Constant::Char(c) => + Some(clause_name!(c.to_string(), self.lexer.atom_tbl)), + /* in this arm `c` is the function parameter (the whole constant), not a character */ &Constant::EmptyList => + Some(clause_name!(c.to_string(), self.lexer.atom_tbl)), + _ => None + } + } + + /* Shifts the numeric constant constr(n) as a priority-0 term. If the top term is the atom `-` and the top stack entry is marked prefix or as the negative sign, that `-` is removed from both stacks and constr(negator(n)) is shifted instead. */ fn negate_number( + &mut self, + n: N, + negator: Negator, + constr: ToConstant + ) + where Negator: Fn(N) -> N, + ToConstant: Fn(N) -> Constant + { + if let Some(desc) = self.stack.last().cloned() { + if let Some(term) = self.terms.last().cloned() { + match term { + Term::Constant(_, Constant::Atom(ref name, _)) + if name.as_str() == "-" && (is_prefix!(desc.spec) || is_negate!(desc.spec)) => { + self.stack.pop(); + self.terms.pop(); + + self.shift(Token::Constant(constr(negator(n))), 0, TERM); + return; + }, + _ => {} + } + } + } + + self.shift(Token::Constant(constr(n)), 0, TERM); + } + + /* Processes one token: numbers go through negate_number, other constants are tried as operators first and shifted as plain terms otherwise, opening brackets are shifted as delimiters with priority 1300, closing brackets trigger the matching reduce_* method, and `,` and `|` reduce pending operators before being shifted. A failed reduction is reported as ParserError::IncompleteReduction. */ fn shift_token(&mut self, token: Token, op_dir: &CompositeOpDir) -> Result<(), ParserError> { + /* NOTE(review): the value is negated only when Rc::get_mut succeeds, i.e. when this Rc is the sole owner; a shared Rc is returned unchanged, so the sign would be lost silently - confirm callers always pass a uniquely owned Rc */ fn negate_rc(mut t: Rc) -> Rc { + match Rc::get_mut(&mut t) { + Some(t) => { + t.neg_assign(); + } + None => { + } + }; + + t + } + + match token { + Token::Constant(Constant::Fixnum(n)) => + self.negate_number(n, |n| -n, Constant::Fixnum), + Token::Constant(Constant::Integer(n)) => + self.negate_number(n, negate_rc, Constant::Integer), + Token::Constant(Constant::Rational(n)) => + self.negate_number(n, negate_rc, Constant::Rational), + Token::Constant(Constant::Float(n)) => + self.negate_number( + n, + |n| OrderedFloat(-n.into_inner()), + |n| Constant::Float(n) + ), + Token::Constant(c) => + if let Some(name) = self.atomize_constant(&c) { + if !self.shift_op(name, op_dir)? 
{ + self.shift(Token::Constant(c), 0, TERM); + } + } else { + self.shift(Token::Constant(c), 0, TERM); + }, + Token::Var(v) => self.shift(Token::Var(v), 0, TERM), + Token::Open => self.shift(Token::Open, 1300, DELIMITER), + Token::OpenCT => self.shift(Token::OpenCT, 1300, DELIMITER), + Token::Close => + if !self.reduce_term(op_dir) { + if !self.reduce_brackets() { + return Err(ParserError::IncompleteReduction( + self.lexer.line_num, + self.lexer.col_num, + )); + } + }, + Token::OpenList => self.shift(Token::OpenList, 1300, DELIMITER), + Token::CloseList => + if !self.reduce_list()? { + return Err(ParserError::IncompleteReduction( + self.lexer.line_num, + self.lexer.col_num, + )); + }, + Token::OpenCurly => self.shift(Token::OpenCurly, 1300, DELIMITER), + Token::CloseCurly => + if !self.reduce_curly()? { + return Err(ParserError::IncompleteReduction( + self.lexer.line_num, + self.lexer.col_num, + )); + }, + Token::HeadTailSeparator => { + /* '|' as an operator must have priority > 1000 and can only be infix. + * See: http://www.complang.tuwien.ac.at/ulrich/iso-prolog/dtc2#Res_A78 + */ + let (priority, spec) = get_op_desc(clause_name!("|"), op_dir) + .map(|OpDesc { inf, spec, .. 
}| (inf, spec)) + .unwrap_or((1000, DELIMITER)); + + self.reduce_op(priority); + self.shift(Token::HeadTailSeparator, priority, spec); + }, + Token::Comma => { + self.reduce_op(1000); + self.shift(Token::Comma, 1000, XFY); + }, + Token::End => + match self.stack.last().map(|t| t.tt) { + Some(TokenType::Open) + | Some(TokenType::OpenCT) + | Some(TokenType::OpenList) + | Some(TokenType::OpenCurly) + | Some(TokenType::HeadTailSeparator) + | Some(TokenType::Comma) + => return Err(ParserError::IncompleteReduction(self.lexer.line_num, + self.lexer.col_num)), + _ => {} + } + } + + Ok(()) + } + + #[inline] + pub fn eof(&mut self) -> Result { + self.lexer.eof() + } + + pub fn read_term(&mut self, op_dir: &CompositeOpDir) -> Result + { + self.tokens = read_tokens(&mut self.lexer)?; + + while let Some(token) = self.tokens.pop() { + self.shift_token(token, op_dir)?; + } + + self.reduce_op(1400); + + if self.terms.len() > 1 || self.stack.len() > 1 { + return Err(ParserError::IncompleteReduction(self.lexer.line_num, self.lexer.col_num)); + } + + match self.terms.pop() { + Some(term) => if self.terms.is_empty() { + Ok(term) + } else { + Err(ParserError::IncompleteReduction(self.lexer.line_num, self.lexer.col_num)) + }, + _ => Err(ParserError::IncompleteReduction(self.lexer.line_num, self.lexer.col_num)) + } + } + + pub fn read(&mut self, op_dir: &CompositeOpDir) -> Result, ParserError> + { + let mut terms = Vec::new(); + + loop { + terms.push(self.read_term(op_dir)?); + + if self.lexer.eof()? 
{
+                break;
+            }
+        }
+
+        Ok(terms)
+    }
+}
diff --git a/crates/prolog_parser/src/put_back_n.rs b/crates/prolog_parser/src/put_back_n.rs
new file mode 100644
index 00000000..8bef7f30
--- /dev/null
+++ b/crates/prolog_parser/src/put_back_n.rs
@@ -0,0 +1,71 @@
+use std::iter::Peekable;
+
+/// Iterator adaptor with a put-back buffer.
+///
+/// Items in `top` are yielded (last pushed first) before the wrapped
+/// iterator is consulted again.
+#[derive(Debug, Clone)]
+pub struct PutBackN<I: Iterator> {
+    top: Vec<I::Item>,
+    iter: Peekable<I>,
+}
+
+/// Wraps `iterable` in a [`PutBackN`] with an empty put-back buffer.
+pub fn put_back_n<I>(iterable: I) -> PutBackN<I::IntoIter>
+    where I: IntoIterator
+{
+    PutBackN {
+        top: Vec::new(),
+        iter: iterable.into_iter().peekable(),
+    }
+}
+
+impl<I: Iterator> PutBackN<I> {
+    /// Pushes `item` so that it is the next item yielded.
+    #[inline]
+    pub(crate) fn put_back(&mut self, item: I::Item) {
+        self.top.push(item);
+    }
+
+    /// Returns the put-back buffer, leaving it empty.
+    #[inline]
+    pub fn take_buf(&mut self) -> Vec<I::Item> {
+        std::mem::take(&mut self.top)
+    }
+
+    /// Returns a reference to the next item without consuming it.
+    ///
+    /// When the buffer is empty the next item of the wrapped iterator is
+    /// moved into the buffer first; `None` means the wrapped iterator is
+    /// exhausted as well.
+    #[inline]
+    pub(crate) fn peek(&mut self) -> Option<&I::Item> {
+        if self.top.is_empty() {
+            /* This is a kludge for Ctrl-D not being
+             * handled properly if self.iter().peek() isn't called
+             * first. */
+            self.iter.peek()?;
+
+            let item = self.iter.next()?;
+            self.top.push(item);
+        }
+
+        self.top.last()
+    }
+
+    /// Puts back every item of `iter`; its first item is yielded next.
+    #[inline]
+    pub(crate) fn put_back_all<DEI: DoubleEndedIterator<Item = I::Item>>(&mut self, iter: DEI) {
+        // Pushed in reverse because `next` pops from the end of `top`.
+        self.top.extend(iter.rev());
+    }
+}
+
+impl<I: Iterator> Iterator for PutBackN<I> {
+    type Item = I::Item;
+
+    /// Yields put-back items first, then items of the wrapped iterator.
+    #[inline]
+    fn next(&mut self) -> Option<Self::Item> {
+        if self.top.is_empty() {
+            self.iter.next()
+        } else {
+            self.top.pop()
+        }
+    }
+}
diff --git a/crates/prolog_parser/src/tabled_rc.rs b/crates/prolog_parser/src/tabled_rc.rs
new file mode 100644
index 00000000..271ad3bf
--- /dev/null
+++ b/crates/prolog_parser/src/tabled_rc.rs
@@ -0,0 +1,153 @@
+use std::cell::{RefCell, RefMut};
+use std::cmp::Ordering;
+use std::collections::HashSet;
+use std::fmt;
+use std::hash::{Hash, Hasher};
+use std::ops::Deref;
+use std::rc::{Rc};
+
+/// Handle to a shared set of reference-counted values plus the name of the
+/// module it belongs to. Cloning the handle shares the same underlying set.
+pub struct TabledData<T> {
+    table: Rc<RefCell<HashSet<Rc<T>>>>,
+    pub(crate) module_name: Rc<String>
+}
+
+impl<T: fmt::Debug> fmt::Debug for TabledData<T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // The `module_name` field used to print `self.table` a second time.
+        f.debug_struct("TabledData")
+            .field("table", &self.table)
+            .field("module_name", &self.module_name)
+            .finish()
+    }
+}
+
+impl<T> Clone for TabledData<T> {
+    /// Clones the two `Rc` handles; the table contents are not copied.
+    fn clone(&self) -> Self {
+        TabledData { table: self.table.clone(),
+                     module_name: self.module_name.clone() }
+    }
+}
+
+impl<T> PartialEq for TabledData<T> {
+    /// Two handles are equal when they point at the same table allocation
+    /// and carry equal module names.
+    fn eq(&self, other: &TabledData<T>) -> bool
+    {
+        Rc::ptr_eq(&self.table, &other.table) && self.module_name == other.module_name
+    }
+}
+
+impl<T: Hash + Eq> TabledData<T> {
+    /// Creates an empty table for the module `module_name`.
+    #[inline]
+    pub fn new(module_name: Rc<String>) -> Self {
+        TabledData {
+            table: Rc::new(RefCell::new(HashSet::new())),
+            module_name
+        }
+    }
+
+    /// Mutably borrows the underlying set.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the set is already borrowed, as `RefCell::borrow_mut` does.
+    #[inline]
+    pub fn borrow_mut(&self) -> RefMut<HashSet<Rc<T>>> {
+        self.table.borrow_mut()
+    }
+}
+
+/// A reference-counted value together with the table it is registered in.
+pub struct TabledRc<T: Hash + Eq> {
+    pub(crate) atom: Rc<T>,
+    pub table: TabledData<T>
+}
+
+impl<T: Hash + Eq + fmt::Debug> fmt::Debug for TabledRc<T> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("TabledRc")
+            .field("atom", &self.atom)
+            .field("table", &self.table)
+            .finish()
+    }
+}
+
+// this Clone instance is manually defined to prevent the compiler
+// from complaining when deriving Clone for StringList.
+impl<T: Hash + Eq> Clone for TabledRc<T> {
+    /// Clones the `Rc` to the value and the table handle; the value itself
+    /// is not copied.
+    fn clone(&self) -> Self {
+        TabledRc { atom: self.atom.clone(), table: self.table.clone() }
+    }
+}
+
+impl<T: Ord + Hash + Eq> PartialOrd for TabledRc<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering>
+    {
+        // Delegate to `Ord` so the two orderings cannot drift apart.
+        Some(self.cmp(other))
+    }
+}
+
+impl<T: Ord + Hash + Eq> Ord for TabledRc<T> {
+    /// Orders by the wrapped value only; the table is ignored.
+    fn cmp(&self, other: &Self) -> Ordering
+    {
+        self.atom.cmp(&other.atom)
+    }
+}
+
+impl<T: Hash + Eq> PartialEq for TabledRc<T> {
+    /// Compares the wrapped values only; the table is ignored.
+    fn eq(&self, other: &TabledRc<T>) -> bool
+    {
+        self.atom == other.atom
+    }
+}
+
+impl<T: Hash + Eq> Eq for TabledRc<T> {}
+
+impl<T: Hash + Eq> Hash for TabledRc<T> {
+    /// Hashes the wrapped value only, consistent with `PartialEq`.
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.atom.hash(state)
+    }
+}
+
+impl<T: Hash + Eq> TabledRc<T> {
+    /// Registers `atom` in `table` and returns a handle to it.
+    ///
+    /// If an equal value is already in the table, its `Rc` is reused;
+    /// otherwise `atom` is moved into a new `Rc` that is added to the table.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the table is already borrowed (see
+    /// `TabledData::borrow_mut`).
+    pub fn new(atom: T, table: TabledData<T>) -> Self {
+        let atom = {
+            // One borrow for lookup and insertion. The old code removed the
+            // entry with `take`, cloned it, and re-inserted it under a
+            // second borrow.
+            let mut entries = table.borrow_mut();
+            let existing = entries.get(&atom).cloned();
+
+            match existing {
+                Some(shared) => shared,
+                None => {
+                    let fresh = Rc::new(atom);
+                    entries.insert(fresh.clone());
+                    fresh
+                }
+            }
+        };
+
+        TabledRc { atom, table }
+    }
+
+    /// Returns another `Rc` handle to the wrapped value.
+    #[inline]
+    pub fn inner(&self) -> Rc<T> {
+        self.atom.clone()
+    }
+
+    /// Returns the module name stored in the table handle.
+    #[inline]
+    pub(crate) fn owning_module(&self) -> Rc<String> {
+        self.table.module_name.clone()
+    }
+}
+
+impl<T: Hash + Eq> Drop for TabledRc<T> {
+    /// Removes the value from the table when the strong count is exactly 2,
+    /// i.e. when only this handle and one other `Rc` (normally the table's
+    /// own entry) still refer to it.
+    fn drop(&mut self) {
+        if Rc::strong_count(&self.atom) == 2 {
+            self.table.borrow_mut().remove(&self.atom);
+        }
+    }
+}
+
+impl<T: Hash + Eq> Deref for TabledRc<T> {
+    type Target = T;
+
+    fn deref(&self) -> &Self::Target {
+        &*self.atom
+    }
+}
+
+impl<T: Hash + Eq + fmt::Display> fmt::Display for TabledRc<T> {
+    /// Displays the wrapped value.
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", &*self.atom)
+    }
+}
+
+/// Builds a `TabledRc<String>` from `$e`, registered in a clone of the table
+/// handle `$tbl`.
+#[macro_export]
+macro_rules! tabled_rc {
+    ($e:expr, $tbl:expr) => (
+        TabledRc::new(String::from($e), $tbl.clone())
+    )
+}
-- 
2.54.0