nearly as fast as wc and now yield words

This commit is contained in:
Yehowshua Immanuel 2022-06-02 20:02:09 -04:00
parent 08c5058208
commit 43563d0d7c

View file

@ -11,32 +11,32 @@ use clap::Parser;
use std::slice; use std::slice;
use std::str; use std::str;
use std::collections::VecDeque;
#[derive(Parser)] #[derive(Parser)]
struct Cli { struct Cli {
/// The path to the file to read /// The path to the file to read
#[clap(parse(from_os_str))] #[clap(parse(from_os_str))]
path: std::path::PathBuf} path: std::path::PathBuf}
// TODO: implement any timescales greater than a second
#[derive(Debug)]
enum Timescale {ps, ns, us, ms, s, unit}
#[derive(Debug)]
struct Scope_Idx(usize);
#[derive(Debug)]
struct Signal_Idx(usize);
#[derive(Debug)] #[derive(Debug)]
struct Version(String); struct Version(String);
#[derive(Debug)]
enum Timescale {ps, ns, us, ms, s, unit}
#[derive(Debug)] #[derive(Debug)]
struct Metadata { struct Metadata {
date : Option<DateTime<Utc>>, date : Option<DateTime<Utc>>,
version : Option<Version>, version : Option<Version>,
timescale : (Option<u32>, Timescale)} timescale : (Option<u32>, Timescale)}
#[derive(Debug)]
struct Scope_Idx(usize);
#[derive(Debug)]
struct Signal_Idx(usize);
#[derive(Debug)] #[derive(Debug)]
enum SignalGeneric{ enum SignalGeneric{
Signal{ Signal{
@ -62,32 +62,6 @@ struct VCD {
// the root scope should always be placed at index 0 // the root scope should always be placed at index 0
all_scopes : Vec<Scope>} all_scopes : Vec<Scope>}
/// Progress through the `$date` section: `Begin` until the `$date` keyword
/// has been seen, `Parsing` while its body is collected up to `$end`.
#[derive(PartialEq, Debug)]
enum Date_Parser_State {
    Begin,
    Parsing,
}

/// Progress through the `$version` section: `Begin` until the `$version`
/// keyword has been seen, `Parsing` while its body is collected up to `$end`.
#[derive(PartialEq, Debug)]
enum Version_Parser_State {
    Begin,
    Parsing,
}

/// Progress through the `$timescale` section: `Begin` until the `$timescale`
/// keyword has been seen, `Parsing` while its body is collected up to `$end`.
#[derive(PartialEq, Debug)]
enum Timescale_Parser_State {
    Begin,
    Parsing,
}

/// Progress through the signal-tree section.
// NOTE(review): presumably mirrors the Begin/Parsing meaning of the other
// sections; confirm once a signal-tree parser exists.
#[derive(PartialEq, Debug)]
enum Signal_Tree_Parser_State {
    Begin,
    Parsing,
}

/// Top-level parser state: which section of the file is being handled,
/// together with that section's sub-state where it has one.
#[derive(PartialEq, Debug)]
enum Parser_State {
    Date(Date_Parser_State),
    Version(Version_Parser_State),
    Timescale(Timescale_Parser_State),
    Signal_Tree(Signal_Tree_Parser_State),
    Parse_Signal_Values,
}
/// Word-at-a-time parser that fills in a mutably borrowed `VCD`.
struct VCD_Parser<'a> {
    /// Section (and sub-state) the parser is currently in.
    vcd_parser_state : Parser_State,
    /// Words collected for the section body being parsed, joined by single
    /// spaces; `None` when nothing has been collected.
    buffer : Option<String>,
    /// Destination whose metadata the parse methods write into.
    vcd : &'a mut VCD,
    // NOTE(review): the two scope references below are only initialised to
    // `None` in the code visible here; presumably reserved for scope-tree
    // parsing; confirm.
    curr_scope : Option<&'a Scope>,
    curr_parent_scope : Option<&'a Scope>,
}
impl VCD { impl VCD {
pub fn new() -> Self { pub fn new() -> Self {
let metadata = Metadata { let metadata = Metadata {
@ -101,275 +75,20 @@ impl VCD {
} }
} }
impl<'a> VCD_Parser<'a> {
pub fn new(vcd : &'a mut VCD) -> Self {
VCD_Parser {
vcd_parser_state : Parser_State::Date(Date_Parser_State::Begin),
buffer : None, #[derive(Debug)]
vcd : vcd, struct Line(usize);
curr_scope : None, #[derive(Debug)]
curr_parent_scope : None struct Word(usize);
} #[derive(Debug)]
} struct Cursor(Line, Word);
/// Feeds one whitespace-delimited word to the parser, dispatching on the
/// section the parser is currently in.
///
/// # Errors
/// Returns `Err` when the current state has no handler here (any state other
/// than `Date`, `Version` or `Timescale`), or when the section handler fails.
pub fn parse_word(&mut self, word : &str) -> Result<(), String> {
    // Each handler inspects the sub-state itself, so only the outer variant
    // matters for dispatch.
    if matches!(self.vcd_parser_state, Parser_State::Date(_)) {
        return self.parse_date(word);
    }
    if matches!(self.vcd_parser_state, Parser_State::Version(_)) {
        return self.parse_version(word);
    }
    if matches!(self.vcd_parser_state, Parser_State::Timescale(_)) {
        return self.parse_timescale(word);
    }

    // TODO : Enable the following in production
    // Err(format!("parser in bad state : {state:?}"))
    // TODO : Disable the following in production
    let state = &self.vcd_parser_state;
    let t = &self.vcd;
    Err(format!("parser in bad state : {state:?}; {t:?}"))
}
/// Handles one word while the parser is in the `Date` section.
///
/// * `Begin`: `$date` switches to `Parsing`; any other word means the file
///   has no date section, so the parser advances to the version section and
///   hands the same word to `parse_version`.
/// * `Parsing`: words are accumulated (space separated) in `self.buffer`
///   until `$end`, at which point the text is parsed with the format
///   `"%a %b %e %T %Y"` and stored in `self.vcd.metadata.date`.
///
/// # Errors
/// Returns `Err` if the date section is empty, if its text does not match
/// the expected format, or if the parser is not in a `Date` state. On error
/// the parser state is left unchanged.
#[named]
pub fn parse_date(&mut self, word : &str) -> Result<(), String> {
    let state = &mut self.vcd_parser_state;
    match state {
        Parser_State::Date(Date_Parser_State::Begin) =>
            match word {
                "$date" => {
                    *state = Parser_State::Date(Date_Parser_State::Parsing);
                    Ok(())
                }
                _ => {
                    *state = Parser_State::Version(Version_Parser_State::Begin);
                    self.parse_version(word)
                }
            }
        Parser_State::Date(Date_Parser_State::Parsing) =>
            match word {
                "$end" => {
                    // The section body comes from the input file, so a missing
                    // or malformed date is reported instead of panicking.
                    let s = self.buffer.take()
                        .ok_or_else(|| String::from("$date section is empty"))?;
                    let dt = Utc.datetime_from_str(s.as_str(), "%a %b %e %T %Y")
                        .map_err(|e| format!("invalid date {s} : {e}"))?;
                    *state = Parser_State::Version(Version_Parser_State::Begin);
                    self.vcd.metadata.date = Some(dt);
                    Ok(())
                }
                _ => {
                    match self.buffer {
                        Some(ref mut buffer) => {
                            buffer.push(' ');
                            buffer.push_str(word);
                        }
                        None => self.buffer = Some(word.to_string()),
                    }
                    Ok(())
                }
            }
        _ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
    }
}
#[named]
pub fn parse_statement(
&'a mut self,
curr_word : &str,
key_word : &str,
begin_state : Parser_State,
parsing_state : Parser_State,
end_state : Parser_State,
next_parser : fn(&'a mut VCD_Parser, &str) -> Result<(), String>
) -> Result<(), String> {
let mut state = &mut self.vcd_parser_state;
if (*state == begin_state) {
return match curr_word {
key_word => {
*state = Parser_State::Date(Date_Parser_State::Parsing);
Ok(())
}
_ => {
*state = Parser_State::Version(Version_Parser_State::Begin);
next_parser(self, curr_word)
}
}
}
else {
Ok(())
}
// Ok(())
// match state {
// Parser_State::Date(Date_Parser_State::Begin) =>
// match curr_word {
// key_word => {
// *state = Parser_State::Date(Date_Parser_State::Parsing);
// Ok(())
// }
// _ => {
// *state = Parser_State::Version(Version_Parser_State::Begin);
// self.parse_version(curr_word)
// }
// }
// Parser_State::Date(Date_Parser_State::Parsing) =>
// match curr_word {
// "$end" => {
// let s = self.buffer.take().unwrap();
// let dt = Utc.datetime_from_str(s.as_str(), "%a %b %e %T %Y")
// .expect(&format!("invalid date {s}").as_str());
// *state = Parser_State::Version(Version_Parser_State::Begin);
// self.vcd.metadata.date = Some(dt);
// Ok(())
// }
// _ => {
// if let Some(ref mut buffer) = self.buffer {
// buffer.push_str(" ");
// buffer.push_str(curr_word);
// }
// else {
// self.buffer = Some(curr_word.to_string());
// }
// Ok(())
// }
// }
// _ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
// }
}
/// Handles one word while the parser is in the `Version` section.
///
/// * `Begin`: `$version` switches to `Parsing`; any other word means the
///   file has no version section, so the parser advances to the timescale
///   section and hands the same word to `parse_timescale`. (Previously the
///   word was dropped here, unlike in `parse_date`.)
/// * `Parsing`: words are accumulated (space separated) in `self.buffer`
///   until `$end`, when the text is stored in `self.vcd.metadata.version`.
///   An empty section yields an empty version string instead of a panic.
///
/// # Errors
/// Returns `Err` if the parser is not in a `Version` state, or if the
/// forwarded call to `parse_timescale` fails.
#[named]
pub fn parse_version(&mut self, word : &str) -> Result<(), String> {
    let state = &mut self.vcd_parser_state;
    match state {
        Parser_State::Version(Version_Parser_State::Begin) =>
            match word {
                "$version" => {
                    *state = Parser_State::Version(Version_Parser_State::Parsing);
                    Ok(())
                }
                _ => {
                    *state = Parser_State::Timescale(Timescale_Parser_State::Begin);
                    // Hand the word on so that the next section's keyword is
                    // not lost when the version section is absent.
                    self.parse_timescale(word)
                }
            }
        Parser_State::Version(Version_Parser_State::Parsing) =>
            match word {
                "$end" => {
                    let s = self.buffer.take().unwrap_or_default();
                    self.vcd.metadata.version = Some(Version(s));
                    *state = Parser_State::Timescale(Timescale_Parser_State::Begin);
                    Ok(())
                }
                _ => {
                    match self.buffer {
                        Some(ref mut buffer) => {
                            buffer.push(' ');
                            buffer.push_str(word);
                        }
                        None => self.buffer = Some(word.to_string()),
                    }
                    Ok(())
                }
            }
        _ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
    }
}
/// Handles one word while the parser is in the `Timescale` section.
///
/// * `Begin`: `$timescale` switches to `Parsing`; any other word moves the
///   parser on to the signal-tree section.
/// * `Parsing`: words are accumulated (space separated) in `self.buffer`
///   until `$end`. The body must then be a scalar and a unit, written either
///   as two words (`1 ps`) or fused (`1ps`); the result is stored in
///   `self.vcd.metadata.timescale` and the parser advances to the
///   signal-tree section.
///
/// Changes from the previous version: the state now advances after `$end`
/// instead of resetting to `Timescale::Begin`; the fused `1ps` form no
/// longer panics on an out-of-range index; malformed input returns `Err`
/// instead of panicking; the leftover `dbg!` is gone.
///
/// # Errors
/// Returns `Err` if the body is not a scalar plus one of `ps`, `ns`, `us`,
/// `ms`, `s`, or if the parser is not in a `Timescale` state. On error the
/// parser state is left unchanged.
#[named]
pub fn parse_timescale(&mut self, word : &str) -> Result<(), String> {
    let state = &mut self.vcd_parser_state;
    match state {
        Parser_State::Timescale(Timescale_Parser_State::Begin) =>
            match word {
                "$timescale" => {
                    *state = Parser_State::Timescale(Timescale_Parser_State::Parsing);
                    Ok(())
                }
                _ => {
                    // NOTE(review): this word is not forwarded because no
                    // signal-tree handler is visible to pass it to.
                    *state = Parser_State::Signal_Tree(Signal_Tree_Parser_State::Begin);
                    Ok(())
                }
            }
        Parser_State::Timescale(Timescale_Parser_State::Parsing) =>
            match word {
                "$end" => {
                    let text = self.buffer.take().unwrap_or_default();
                    let tokens = text.split_ascii_whitespace().collect::<Vec<&str>>();

                    // Accept both "1 ps" and the fused "1ps" spelling.
                    let (scalar, unit) = match tokens.as_slice() {
                        [scalar, unit] => (*scalar, *unit),
                        [fused] => {
                            let digits_end = fused
                                .find(|c : char| !c.is_ascii_digit())
                                .unwrap_or(fused.len());
                            fused.split_at(digits_end)
                        }
                        _ => return Err(format!("malformed $timescale body {text:?}")),
                    };

                    let scalar = scalar.parse::<u32>()
                        .map_err(|e| format!("invalid timescale scalar {scalar:?} : {e}"))?;
                    let unit = match unit {
                        "ps" => Timescale::ps,
                        "ns" => Timescale::ns,
                        "us" => Timescale::us,
                        "ms" => Timescale::ms,
                        "s"  => Timescale::s,
                        // TODO : see if there is a way to easily print out all enum variants
                        _ => return Err(format!("{unit} is not a valid unit")),
                    };

                    self.vcd.metadata.timescale = (Some(scalar), unit);
                    *state = Parser_State::Signal_Tree(Signal_Tree_Parser_State::Begin);
                    Ok(())
                }
                _ => {
                    match self.buffer {
                        Some(ref mut buffer) => {
                            buffer.push(' ');
                            buffer.push_str(word);
                        }
                        None => self.buffer = Some(word.to_string()),
                    }
                    Ok(())
                }
            }
        _ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
    }
}
}
/// 1-based line number within the input.
struct Line(u32);
/// 1-based byte column at which a word starts within its line.
struct Col(u32);
/// Location of a word in the input: its line and its starting column.
struct Position(Line, Col);

/// Reads `file` line by line and calls `f` once for every
/// ASCII-whitespace-separated word, passing the word's bytes and its
/// position.
///
/// The column is derived from the word's actual byte offset in the line, so
/// it stays correct with leading whitespace, tabs and runs of several
/// spaces. (The previous version assumed exactly one separator between
/// words and none before the first.)
///
/// # Panics
/// Panics if a line cannot be read (I/O error or invalid UTF-8) or if `f`
/// returns `Err`; the function has no return value through which to report
/// either.
fn yield_word_and_apply(file : File, mut f : impl FnMut(&[u8], Position) -> Result<(), String>) {
    let mut reader = io::BufReader::new(file);
    let mut buffer = String::new();
    let mut line = 0u32;

    loop {
        buffer.clear();
        let bytes_read = reader.read_line(&mut buffer).unwrap();
        if bytes_read == 0 {break}

        line += 1;
        // Every word is a sub-slice of `buffer`, so the distance between the
        // two start addresses is the word's byte offset within the line.
        let line_start = buffer.as_ptr() as usize;
        for word in buffer.split_ascii_whitespace() {
            let offset = word.as_ptr() as usize - line_start;
            let col = (offset + 1).min(u32::MAX as usize) as u32;
            let position = Position(Line(line), Col(col));
            f(word.as_bytes(), position).unwrap();
        }
    }
}
struct YieldByWord { struct YieldByWord {
reader : io::BufReader<File>, reader : io::BufReader<File>,
words : Vec<String>,
EOF : bool, EOF : bool,
buffer : String, buffers : Vec<String>,
str_slices : Vec<(*const u8, usize)>, curr_line : usize,
str_slices : VecDeque<(*const u8, usize, Cursor)>,
} }
impl YieldByWord { impl YieldByWord {
@ -377,41 +96,57 @@ impl YieldByWord {
let mut reader = io::BufReader::new(file); let mut reader = io::BufReader::new(file);
YieldByWord { YieldByWord {
reader : reader, reader : reader,
words : vec![],
EOF : false, EOF : false,
buffer : "".to_string(), buffers : vec![],
str_slices : vec![], curr_line : 0,
str_slices : VecDeque::new()
} }
} }
fn next_word(&mut self) -> Option<&str> { fn next_word(&mut self) -> Option<(&str, Cursor)> {
// if there are no more words, attempt to read more content // if there are no more words, attempt to read more content
// from the file // from the file
if self.str_slices.is_empty() { if self.str_slices.is_empty() {
self.buffer.clear(); self.buffers.clear();
if self.EOF {return None} if self.EOF {return None}
let line_chunk_size = 10; let num_buffers = 10;
for _ in 0..line_chunk_size { for buf_idx in 0..num_buffers {
let bytes_read = self.reader.read_line(&mut self.buffer).unwrap(); self.buffers.push(String::new());
// we hit the end of the file, so we go ahead and return None self.curr_line += 1;
if bytes_read == 0 {self.EOF = true} let bytes_read = self.reader.read_line(&mut self.buffers[buf_idx]).unwrap();
// if we've reached the end of the file while attempting to read a line
// in this for loop, no further attempts are necessary and we stop reading
if bytes_read == 0 {
self.EOF = true;
break;
} }
let words = self.buffer.split_ascii_whitespace(); let mut words = self.buffers[buf_idx].split_ascii_whitespace();
self.str_slices = words
.rev() for word in words.enumerate() {
.map(|s| (s.as_ptr(), s.len())) let (word_idx, word) = word;
.collect(); let position = Cursor(Line(self.curr_line), Word(word_idx + 1));
self.str_slices.push_back((word.as_ptr(), word.len(), position))
}
}
}
// if after we've attempted to read in more content from the file,
// there are still no words...
if self.str_slices.is_empty() {
return None
} }
// if we make it here, we return the next word // if we make it here, we return the next word
unsafe { unsafe {
let (ptr, len) = self.str_slices.pop().unwrap(); let (ptr, len, position) = self.str_slices.pop_front().unwrap();
let slice = slice::from_raw_parts(ptr, len); let slice = slice::from_raw_parts(ptr, len);
return Some(str::from_utf8(slice).unwrap()); return Some((str::from_utf8(slice).unwrap(), position));
}; };
} }
} }
@ -422,27 +157,19 @@ fn main() -> std::io::Result<()> {
let file = File::open(&args.path)?; let file = File::open(&args.path)?;
let mut word_gen = YieldByWord::new(file); let mut word_gen = YieldByWord::new(file);
let mut word_count = 0; let mut word_count = 0;
let mut last_word = String::new();
// for word in 0..5 {
// dbg!(word_gen.next_word());
// }
while word_gen.next_word().is_some() { while word_gen.next_word().is_some() {
word_count += 1; word_count += 1;
} }
dbg!(word_count); dbg!(word_count);
// loop { // loop {
// let next_word = word_gen.next_word(); // let word = word_gen.next_word();
// if next_word.is_some() { // if word.is_none() {break};
// last_word = next_word.unwrap();
// } // dbg!(word.unwrap());
// else {
// break
// }
// } // }
// dbg!(last_word);
Ok(()) Ok(())
} }