nearly as fast as wc and now yield words
This commit is contained in:
parent
08c5058208
commit
43563d0d7c
387
src/main.rs
387
src/main.rs
|
@ -11,32 +11,32 @@ use clap::Parser;
|
|||
use std::slice;
|
||||
use std::str;
|
||||
|
||||
use std::collections::VecDeque;
|
||||
|
||||
#[derive(Parser)]
|
||||
struct Cli {
|
||||
/// The path to the file to read
|
||||
#[clap(parse(from_os_str))]
|
||||
path: std::path::PathBuf}
|
||||
|
||||
|
||||
// TODO: implement any timescales greater than a second
/// Unit of time that the timescale of a VCD file is expressed in.
// The lowercase variant names are referenced as-is elsewhere in the file
// (e.g. `Timescale::ps`), so they cannot be renamed from within this block.
#[allow(non_camel_case_types)]
#[derive(Debug)]
enum Timescale {
    /// Picoseconds.
    ps,
    /// Nanoseconds.
    ns,
    /// Microseconds.
    us,
    /// Milliseconds.
    ms,
    /// Seconds.
    s,
    /// NOTE(review): presumably a placeholder used before any timescale has
    /// been parsed; confirm against the code that constructs `Metadata`.
    unit,
}
|
||||
|
||||
/// Newtype around a `usize` that identifies a scope.
///
/// NOTE(review): presumably an index into the `all_scopes` vector of `VCD`;
/// confirm against the code that creates these values.
// The name is referenced elsewhere in the file, so it is kept as-is.
#[allow(non_camel_case_types)]
#[derive(Debug)]
struct Scope_Idx(usize);
|
||||
|
||||
/// Newtype around a `usize` that identifies a signal.
///
/// NOTE(review): presumably an index into a signal collection owned by `VCD`;
/// confirm against the code that creates these values.
// The name is referenced elsewhere in the file, so it is kept as-is.
#[allow(non_camel_case_types)]
#[derive(Debug)]
struct Signal_Idx(usize);
|
||||
|
||||
/// Free-form text of a VCD `$version` section, stored verbatim.
#[derive(Debug)]
struct Version(String);
|
||||
|
||||
/// Unit of time that the timescale of a VCD file is expressed in.
// The lowercase variant names are referenced as-is elsewhere in the file
// (e.g. `Timescale::ps`), so they cannot be renamed from within this block.
#[allow(non_camel_case_types)]
#[derive(Debug)]
enum Timescale {
    /// Picoseconds.
    ps,
    /// Nanoseconds.
    ns,
    /// Microseconds.
    us,
    /// Milliseconds.
    ms,
    /// Seconds.
    s,
    /// NOTE(review): presumably a placeholder used before any timescale has
    /// been parsed; confirm against the code that constructs `Metadata`.
    unit,
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct Metadata {
|
||||
date : Option<DateTime<Utc>>,
|
||||
version : Option<Version>,
|
||||
timescale : (Option<u32>, Timescale)}
|
||||
|
||||
/// Newtype around a `usize` that identifies a scope.
///
/// NOTE(review): presumably an index into the `all_scopes` vector of `VCD`;
/// confirm against the code that creates these values.
// The name is referenced elsewhere in the file, so it is kept as-is.
#[allow(non_camel_case_types)]
#[derive(Debug)]
struct Scope_Idx(usize);
|
||||
|
||||
/// Newtype around a `usize` that identifies a signal.
///
/// NOTE(review): presumably an index into a signal collection owned by `VCD`;
/// confirm against the code that creates these values.
// The name is referenced elsewhere in the file, so it is kept as-is.
#[allow(non_camel_case_types)]
#[derive(Debug)]
struct Signal_Idx(usize);
|
||||
|
||||
#[derive(Debug)]
|
||||
enum SignalGeneric{
|
||||
Signal{
|
||||
|
@ -62,32 +62,6 @@ struct VCD {
|
|||
// the root scope should always be placed at index 0
|
||||
all_scopes : Vec<Scope>}
|
||||
|
||||
// The type and variant names below are referenced elsewhere in the file, so
// their (non-CamelCase) spelling is kept and the naming lint is silenced.

/// Progress through the optional `$date` section.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum Date_Parser_State {
    /// The section keyword has not been seen yet.
    Begin,
    /// Inside the section, collecting words until `$end`.
    Parsing,
}

/// Progress through the optional `$version` section.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum Version_Parser_State {
    /// The section keyword has not been seen yet.
    Begin,
    /// Inside the section, collecting words until `$end`.
    Parsing,
}

/// Progress through the optional `$timescale` section.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum Timescale_Parser_State {
    /// The section keyword has not been seen yet.
    Begin,
    /// Inside the section, collecting words until `$end`.
    Parsing,
}

/// Progress through the signal tree (scope / variable declarations).
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum Signal_Tree_Parser_State {
    /// No part of the signal tree has been consumed yet.
    Begin,
    /// Inside the signal tree.
    Parsing,
}

/// Top-level state of the word-by-word VCD parser: which part of the file is
/// being consumed and, where applicable, how far into it the parser is.
#[allow(non_camel_case_types)]
#[derive(Debug, PartialEq)]
enum Parser_State {
    Date(Date_Parser_State),
    Version(Version_Parser_State),
    Timescale(Timescale_Parser_State),
    Signal_Tree(Signal_Tree_Parser_State),
    Parse_Signal_Values,
}
|
||||
|
||||
struct VCD_Parser<'a> {
|
||||
vcd_parser_state : Parser_State,
|
||||
buffer : Option<String>,
|
||||
|
||||
vcd : &'a mut VCD,
|
||||
curr_scope : Option<&'a Scope>,
|
||||
curr_parent_scope : Option<&'a Scope>}
|
||||
|
||||
impl VCD {
|
||||
pub fn new() -> Self {
|
||||
let metadata = Metadata {
|
||||
|
@ -101,275 +75,20 @@ impl VCD {
|
|||
}
|
||||
}
|
||||
|
||||
impl<'a> VCD_Parser<'a> {
|
||||
pub fn new(vcd : &'a mut VCD) -> Self {
|
||||
VCD_Parser {
|
||||
vcd_parser_state : Parser_State::Date(Date_Parser_State::Begin),
|
||||
|
||||
buffer : None,
|
||||
vcd : vcd,
|
||||
curr_scope : None,
|
||||
curr_parent_scope : None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn parse_word(&mut self, word : &str) -> Result<(), String> {
|
||||
let mut state = &mut self.vcd_parser_state;
|
||||
let t = &self.vcd;
|
||||
match state {
|
||||
Parser_State::Date(_) => self.parse_date(word),
|
||||
Parser_State::Version(_) => self.parse_version(word),
|
||||
Parser_State::Timescale(_) => self.parse_timescale(word),
|
||||
// TODO : Enable the following in production
|
||||
// _ => Err(format!("parser in bad state : {state:?}"))
|
||||
// TODO : Disable the following in production
|
||||
_ => {
|
||||
Err(format!("parser in bad state : {state:?}; {t:?}"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[named]
|
||||
pub fn parse_date(&mut self, word : &str) -> Result<(), String> {
|
||||
let mut state = &mut self.vcd_parser_state;
|
||||
match state {
|
||||
Parser_State::Date(Date_Parser_State::Begin) =>
|
||||
match word {
|
||||
"$date" => {
|
||||
*state = Parser_State::Date(Date_Parser_State::Parsing);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
*state = Parser_State::Version(Version_Parser_State::Begin);
|
||||
self.parse_version(word)
|
||||
}
|
||||
}
|
||||
Parser_State::Date(Date_Parser_State::Parsing) =>
|
||||
match word {
|
||||
"$end" => {
|
||||
let s = self.buffer.take().unwrap();
|
||||
let dt = Utc.datetime_from_str(s.as_str(), "%a %b %e %T %Y")
|
||||
.expect(&format!("invalid date {s}").as_str());
|
||||
*state = Parser_State::Version(Version_Parser_State::Begin);
|
||||
self.vcd.metadata.date = Some(dt);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
if let Some(ref mut buffer) = self.buffer {
|
||||
buffer.push_str(" ");
|
||||
buffer.push_str(word);
|
||||
}
|
||||
else {
|
||||
self.buffer = Some(word.to_string());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
_ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#[named]
|
||||
pub fn parse_statement(
|
||||
&'a mut self,
|
||||
curr_word : &str,
|
||||
key_word : &str,
|
||||
begin_state : Parser_State,
|
||||
parsing_state : Parser_State,
|
||||
end_state : Parser_State,
|
||||
next_parser : fn(&'a mut VCD_Parser, &str) -> Result<(), String>
|
||||
) -> Result<(), String> {
|
||||
let mut state = &mut self.vcd_parser_state;
|
||||
|
||||
if (*state == begin_state) {
|
||||
return match curr_word {
|
||||
key_word => {
|
||||
*state = Parser_State::Date(Date_Parser_State::Parsing);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
*state = Parser_State::Version(Version_Parser_State::Begin);
|
||||
next_parser(self, curr_word)
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
Ok(())
|
||||
}
|
||||
// Ok(())
|
||||
|
||||
// match state {
|
||||
// Parser_State::Date(Date_Parser_State::Begin) =>
|
||||
// match curr_word {
|
||||
// key_word => {
|
||||
// *state = Parser_State::Date(Date_Parser_State::Parsing);
|
||||
// Ok(())
|
||||
// }
|
||||
// _ => {
|
||||
// *state = Parser_State::Version(Version_Parser_State::Begin);
|
||||
// self.parse_version(curr_word)
|
||||
// }
|
||||
// }
|
||||
// Parser_State::Date(Date_Parser_State::Parsing) =>
|
||||
// match curr_word {
|
||||
// "$end" => {
|
||||
// let s = self.buffer.take().unwrap();
|
||||
// let dt = Utc.datetime_from_str(s.as_str(), "%a %b %e %T %Y")
|
||||
// .expect(&format!("invalid date {s}").as_str());
|
||||
// *state = Parser_State::Version(Version_Parser_State::Begin);
|
||||
// self.vcd.metadata.date = Some(dt);
|
||||
// Ok(())
|
||||
// }
|
||||
// _ => {
|
||||
// if let Some(ref mut buffer) = self.buffer {
|
||||
// buffer.push_str(" ");
|
||||
// buffer.push_str(curr_word);
|
||||
// }
|
||||
// else {
|
||||
// self.buffer = Some(curr_word.to_string());
|
||||
// }
|
||||
// Ok(())
|
||||
// }
|
||||
// }
|
||||
// _ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
|
||||
|
||||
// }
|
||||
}
|
||||
|
||||
#[named]
|
||||
pub fn parse_version(&mut self, word : &str) -> Result<(), String> {
|
||||
let mut state = &mut self.vcd_parser_state;
|
||||
match state {
|
||||
Parser_State::Version(Version_Parser_State::Begin) =>
|
||||
match word {
|
||||
"$version" => {
|
||||
*state = Parser_State::Version(Version_Parser_State::Parsing);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
*state = Parser_State::Timescale(Timescale_Parser_State::Begin);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Parser_State::Version(Version_Parser_State::Parsing) =>
|
||||
match word {
|
||||
"$end" => {
|
||||
let s = self.buffer.take().unwrap();
|
||||
self.vcd.metadata.version = Some(Version(s));
|
||||
*state = Parser_State::Timescale(Timescale_Parser_State::Begin);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
if let Some(ref mut buffer) = self.buffer {
|
||||
buffer.push_str(" ");
|
||||
buffer.push_str(word);
|
||||
}
|
||||
else {
|
||||
self.buffer = Some(word.to_string());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
_ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#[named]
|
||||
pub fn parse_timescale(&mut self, word : &str) -> Result<(), String> {
|
||||
let mut state = &mut self.vcd_parser_state;
|
||||
match state {
|
||||
Parser_State::Timescale(Timescale_Parser_State::Begin) =>
|
||||
match word {
|
||||
"$timescale" => {
|
||||
*state = Parser_State::Timescale(Timescale_Parser_State::Parsing);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
*state = Parser_State::Signal_Tree(Signal_Tree_Parser_State::Begin);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Parser_State::Timescale(Timescale_Parser_State::Parsing) =>
|
||||
match word {
|
||||
"$end" => {
|
||||
let s = self.buffer.take().unwrap();
|
||||
let s = s.split_ascii_whitespace();
|
||||
let s = s.collect::<Vec<&str>>();
|
||||
|
||||
let scalar = s[0].to_string().parse::<u32>().unwrap();
|
||||
let unit = s[1];
|
||||
let unit = match unit {
|
||||
"ps" => Ok(Timescale::ps),
|
||||
"ns" => Ok(Timescale::ns),
|
||||
"us" => Ok(Timescale::us),
|
||||
"ms" => Ok(Timescale::ms),
|
||||
"s" => Ok(Timescale::s),
|
||||
// TODO : see if there is a way to easily print out all enum variants
|
||||
// _ => Err(format!("{word} is not a valid unit of time in {Timescale}"))
|
||||
_ => Err(format!("{unit} is not a valid unit"))
|
||||
}.unwrap();
|
||||
|
||||
dbg!(s);
|
||||
self.vcd.metadata.timescale = (Some(scalar), unit);
|
||||
*state = Parser_State::Timescale(Timescale_Parser_State::Begin);
|
||||
Ok(())
|
||||
}
|
||||
_ => {
|
||||
if let Some(ref mut buffer) = self.buffer {
|
||||
buffer.push_str(" ");
|
||||
buffer.push_str(word);
|
||||
}
|
||||
else {
|
||||
self.buffer = Some(word.to_string());
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
_ => Err(format!("{state:?} should be unreachable within {}.",function_name!())),
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Line number within the input file, counted from 1.
struct Line(u32);

/// Column within a line, counted from 1.
struct Col(u32);

/// Location of a word in the input file.
struct Position(Line, Col);
|
||||
|
||||
fn yield_word_and_apply(file : File, mut f : impl FnMut(&[u8], Position) -> Result<(), String>) {
|
||||
let mut reader = io::BufReader::new(file);
|
||||
|
||||
let mut buffer = String::new();
|
||||
|
||||
let mut line = 0u32;
|
||||
while true {
|
||||
let bytes_read = reader.read_line(&mut buffer).unwrap();
|
||||
if bytes_read == 0 {break}
|
||||
|
||||
line += 1;
|
||||
let mut col = 1u32;
|
||||
|
||||
let mut words = buffer.split_ascii_whitespace();
|
||||
|
||||
for word in words {
|
||||
let word = word.as_bytes();
|
||||
let position = Position(Line(line), Col(col));
|
||||
f(word, position).unwrap();
|
||||
col += (word.len() as u32) + 1;
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
}
|
||||
|
||||
}
|
||||
/// Line number within the input file.
#[derive(Debug)]
struct Line(usize);

/// Position of a word within its line, counted in words starting at 1.
#[derive(Debug)]
struct Word(usize);

/// Location of a word in the input file: its line and its index on that line.
#[derive(Debug)]
struct Cursor(Line, Word);
|
||||
|
||||
struct YieldByWord {
|
||||
reader : io::BufReader<File>,
|
||||
words : Vec<String>,
|
||||
EOF : bool,
|
||||
buffer : String,
|
||||
str_slices : Vec<(*const u8, usize)>,
|
||||
buffers : Vec<String>,
|
||||
curr_line : usize,
|
||||
str_slices : VecDeque<(*const u8, usize, Cursor)>,
|
||||
}
|
||||
|
||||
impl YieldByWord {
|
||||
|
@ -377,41 +96,57 @@ impl YieldByWord {
|
|||
let mut reader = io::BufReader::new(file);
|
||||
YieldByWord {
|
||||
reader : reader,
|
||||
words : vec![],
|
||||
EOF : false,
|
||||
buffer : "".to_string(),
|
||||
str_slices : vec![],
|
||||
buffers : vec![],
|
||||
curr_line : 0,
|
||||
str_slices : VecDeque::new()
|
||||
}
|
||||
}
|
||||
|
||||
fn next_word(&mut self) -> Option<&str> {
|
||||
fn next_word(&mut self) -> Option<(&str, Cursor)> {
|
||||
// if there are no more words, attempt to read more content
|
||||
// from the file
|
||||
if self.str_slices.is_empty() {
|
||||
self.buffer.clear();
|
||||
self.buffers.clear();
|
||||
|
||||
if self.EOF {return None}
|
||||
|
||||
let line_chunk_size = 10;
|
||||
let num_buffers = 10;
|
||||
|
||||
for _ in 0..line_chunk_size {
|
||||
let bytes_read = self.reader.read_line(&mut self.buffer).unwrap();
|
||||
// we hit the end of the file, so we go ahead and return None
|
||||
if bytes_read == 0 {self.EOF = true}
|
||||
for buf_idx in 0..num_buffers {
|
||||
self.buffers.push(String::new());
|
||||
self.curr_line += 1;
|
||||
let bytes_read = self.reader.read_line(&mut self.buffers[buf_idx]).unwrap();
|
||||
|
||||
// if we've reached the end of the file on the first attempt to read
|
||||
// a line in this for loop, no further attempts are necessary and we
|
||||
if bytes_read == 0 {
|
||||
self.EOF = true;
|
||||
break;
|
||||
}
|
||||
|
||||
let words = self.buffer.split_ascii_whitespace();
|
||||
self.str_slices = words
|
||||
.rev()
|
||||
.map(|s| (s.as_ptr(), s.len()))
|
||||
.collect();
|
||||
let mut words = self.buffers[buf_idx].split_ascii_whitespace();
|
||||
|
||||
for word in words.enumerate() {
|
||||
let (word_idx, word) = word;
|
||||
let position = Cursor(Line(self.curr_line), Word(word_idx + 1));
|
||||
self.str_slices.push_back((word.as_ptr(), word.len(), position))
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// if after we've attempted to read in more content from the file,
|
||||
// there are still no words...
|
||||
if self.str_slices.is_empty() {
|
||||
return None
|
||||
}
|
||||
|
||||
// if we make it here, we return the next word
|
||||
unsafe {
|
||||
let (ptr, len) = self.str_slices.pop().unwrap();
|
||||
let (ptr, len, position) = self.str_slices.pop_front().unwrap();
|
||||
let slice = slice::from_raw_parts(ptr, len);
|
||||
return Some(str::from_utf8(slice).unwrap());
|
||||
return Some((str::from_utf8(slice).unwrap(), position));
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -422,27 +157,19 @@ fn main() -> std::io::Result<()> {
|
|||
let file = File::open(&args.path)?;
|
||||
let mut word_gen = YieldByWord::new(file);
|
||||
let mut word_count = 0;
|
||||
let mut last_word = String::new();
|
||||
|
||||
// for word in 0..5 {
|
||||
// dbg!(word_gen.next_word());
|
||||
// }
|
||||
while word_gen.next_word().is_some() {
|
||||
word_count += 1;
|
||||
}
|
||||
dbg!(word_count);
|
||||
|
||||
// loop {
|
||||
// let next_word = word_gen.next_word();
|
||||
// if next_word.is_some() {
|
||||
// last_word = next_word.unwrap();
|
||||
// }
|
||||
// else {
|
||||
// break
|
||||
// }
|
||||
// let word = word_gen.next_word();
|
||||
// if word.is_none() {break};
|
||||
|
||||
// dbg!(word.unwrap());
|
||||
// }
|
||||
|
||||
// dbg!(last_word);
|
||||
|
||||
Ok(())
|
||||
}
|
Loading…
Reference in a new issue