Reworked byte position <-> LSP position conversions (#33)
AndrewF001 authored Jun 5, 2023
1 parent eff1daf commit 5be8774
Showing 11 changed files with 666 additions and 85 deletions.
4 changes: 2 additions & 2 deletions crates/analyzer-core/benches/lexer.rs
@@ -1,6 +1,6 @@
extern crate analyzer_core;

use analyzer_core::*;
use analyzer_core::{lsp_file::LspFile, *};
use base_abstractions::*;
use lexer::*;

@@ -11,7 +11,7 @@ fn baseline(input: String) -> Vec<char> { input.chars().into_iter().collect() }

fn basic(input: String) -> Vec<(Token, Span)> {
let db = Database::new(|base, _| Ok(base.into()));
let buf = Buffer::new(&db, input);
let buf = Buffer::from_string(&db, &input);
let file_id = FileId::new(&db, "foo".to_string());
let lexed = lex(&db, file_id, buf);
lexed.lexemes(&db).clone()
11 changes: 10 additions & 1 deletion crates/analyzer-core/src/base_abstractions.rs
@@ -1,10 +1,19 @@
pub use logos::Span;

use crate::{lsp_file::LspFile, Database};

/// The input buffer.
#[salsa::input]
pub struct Buffer {
#[return_ref]
pub contents: String,
pub file: LspFile,
}

impl Buffer {
pub fn from_string(db: &Database, file: &String) -> Buffer {
let lsp_file = LspFile::new(file);
Buffer::new(db, lsp_file)
}
}

#[salsa::interned]
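Aside (not part of this commit): a minimal sketch of how the new constructor is meant to be called, mirroring the updated call sites in the benches and tests below. The buffer's text now lives behind the stored LspFile rather than a plain String field.

use analyzer_core::*;
use base_abstractions::*;

fn buffer_sketch() {
	let db = Database::new(|base, _| Ok(base.into()));
	let buf = Buffer::from_string(&db, &"hello\nworld\n".to_string());
	// The contents are now reached through the LspFile accessor.
	assert_eq!(buf.file(&db).get_file_content(), "hello\nworld\n");
}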
25 changes: 21 additions & 4 deletions crates/analyzer-core/src/lib.rs
@@ -1,5 +1,6 @@
pub mod base_abstractions;
pub mod lexer;
pub mod lsp_file;
pub mod parser;
pub mod preprocessor;

@@ -9,6 +10,7 @@ use logos::Logos;

use base_abstractions::*;
use lexer::*;
use lsp_file::{ChangeEvent, LspFile};
use preprocessor::*;

// #[derive(Default)]
@@ -77,15 +79,28 @@ impl Analyzer

fn filesystem(&self) -> HashMap<FileId, Buffer> { self.fs.map(|fs| fs.fs(&self.db)).unwrap_or_default() }

pub fn update(&mut self, file_id: FileId, input: String) {
pub fn file_change_event(&mut self, file_id: FileId, event_vec: &Vec<ChangeEvent>) {
let mut filesystem = self.filesystem();
filesystem.insert(file_id, Buffer::new(&self.db, input));

// TODO: avoid cloning
let mut lsp_file = self.get_file(file_id).clone();
for event in event_vec {
lsp_file.lazy_add(event);
}

filesystem.insert(file_id, Buffer::new(&self.db, lsp_file));
self.fs = Fs::new(&self.db, filesystem).into();
}

pub fn update(&mut self, file_id: FileId, input: &String) {
let mut filesystem = self.filesystem();
filesystem.insert(file_id, Buffer::from_string(&self.db, input));
self.fs = Fs::new(&self.db, filesystem).into();
}

pub fn input(&self, file_id: FileId) -> Option<&str> {
let buffer = self.buffer(file_id)?;
Some(buffer.contents(&self.db))
Some(buffer.file(&self.db).get_file_content())
}

pub fn buffer(&self, file_id: FileId) -> Option<Buffer> { self.filesystem().get(&file_id).copied() }
@@ -136,6 +151,8 @@ impl Analyzer
pub fn path(&self, id: FileId) -> String { id.path(&self.db) }

pub fn files(&self) -> Vec<String> { self.filesystem().keys().map(|k| k.path(&self.db)).collect() }

pub fn get_file(&self, id: FileId) -> &LspFile { self.buffer(id).unwrap().file(&self.db) }
}

// TODO: trait for workspace logic?
@@ -210,7 +227,7 @@ where

#[salsa::tracked(return_ref)]
pub fn lex(db: &dyn crate::Db, file_id: FileId, buf: Buffer) -> LexedBuffer {
let contents = buf.contents(db);
let contents = buf.file(db).get_file_content();
let lexer = {
let db = unsafe { std::mem::transmute(db) };
Token::lexer_with_extras(contents, Lextras { db: Some(db), file_id })
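Aside (not part of this commit): a hedged sketch of driving the new incremental entry point. It assumes an already-constructed Analyzer and a registered FileId; neither constructor appears in this diff.

use analyzer_core::lsp_file::{ChangeEvent, Position, Range};
use analyzer_core::{base_abstractions::FileId, Analyzer};

// Replace characters 0..5 of line 0 with "hi"; the analyzer patches the
// stored LspFile via lazy_add rather than reparsing the whole buffer.
fn apply_small_edit(analyzer: &mut Analyzer, file_id: FileId) {
	let events = vec![ChangeEvent {
		range: Some(Range::new(Position::new(0, 0), Position::new(0, 5))),
		text: "hi".to_string(),
	}];
	analyzer.file_change_event(file_id, &events);
}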
244 changes: 244 additions & 0 deletions crates/analyzer-core/src/lsp_file.rs
@@ -0,0 +1,244 @@
use logos::Source; // for slice

#[derive(Debug, Eq, PartialEq, PartialOrd, Copy, Clone, Default)]
pub struct Position {
pub line: usize, // lsp_types uses u32
pub character: usize,
}

impl Position {
pub fn new(line: usize, character: usize) -> Position { Position { line, character } }
}

#[derive(Debug, Eq, PartialEq, Copy, Clone, Default)]
pub struct Range {
pub start: Position,
pub end: Position,
}

impl Range {
pub fn new(start: Position, end: Position) -> Range { Range { start, end } }
}

pub struct ChangeEvent {
pub range: Option<Range>,
pub text: String,
}

#[derive(Clone, Debug)]
pub struct LspFile {
file: String, // File content, use OsString?
ranges: Vec<usize>, // one entry per line: the byte position of the line's last byte (see test_parse_file())
}

impl LspFile {
pub fn new(file: &String) -> Self {
let ranges = LspFile::parse_string(&file);
LspFile { file: file.clone(), ranges }
}

pub fn get_file_content(&self) -> &String { &self.file }

pub fn get_ranges(&self) -> &Vec<usize> { &self.ranges }

// helper: compute the last-byte offset of every line
fn parse_string(string: &String) -> Vec<usize> {
let mut result: Vec<usize> = Vec::new();
if string.is_empty() {
return result;
}

let chars = string.chars();
let mut byte_count = 0;
for (_, c) in chars.enumerate() {
byte_count += c.len_utf8();
if c == '\n' {
result.push(byte_count - 1);
}
}

// If there are trailing bytes after the last '\n', add a final entry
if *result.last().unwrap_or(&(usize::MAX - 1)) != byte_count - 1 {
result.push(byte_count - 1);
}

result
}

// clamps an LSP position to one that is valid in the current file
fn lsp_to_lsp(&self, lsp_pos: &Position) -> Position { self.byte_to_lsp(self.lsp_to_byte(lsp_pos)) }

pub fn line_char(&self, line: usize) -> usize {
// lines past the end of the file are a useful case to handle
if line >= self.ranges.len() {
return 0; // no characters exist on that line
}

let start_pos = if line == 0 { 0 } else { self.ranges.get(line - 1).unwrap_or(&0) + 1 };

let slice = self.file.slice(start_pos..self.ranges[line] + 1).unwrap_or("");
slice.chars().count()
}

pub fn lsp_to_byte(&self, lsp_pos: &Position) -> usize {
// O(1) in the number of lines (the scan below is linear in the target line's length)
// file is empty
if self.ranges.is_empty() {
return 0;
}

// line is beyond the last line, return last byte + 1
if lsp_pos.line >= self.ranges.len() {
return *self.ranges.last().unwrap() + 1;
}

let start_byte = if lsp_pos.line == 0 { 0 } else { self.ranges.get(lsp_pos.line - 1).unwrap_or(&0) + 1 };

// get byte offset for character position in line
let slice = self.file.slice(start_byte..self.ranges[lsp_pos.line]).unwrap_or("").chars();
let mut byte_count = 0;
for (i, c) in slice.enumerate() {
if i == lsp_pos.character {
break;
}
byte_count += c.len_utf8();
}

start_byte + byte_count
}

pub fn lsp_range_to_byte_range(&self, lsp_range: &Range) -> std::ops::Range<usize> {
let start = self.lsp_to_byte(&lsp_range.start);
let end = self.lsp_to_byte(&lsp_range.end);
start..end
}

// O(log n) in the number of lines, plus a scan of the target line
pub fn byte_to_lsp(&self, byte_pos: usize) -> Position {
// file is empty
if self.ranges.is_empty() {
return Position { line: 0, character: 0 };
}

if byte_pos > *self.ranges.last().unwrap_or(&0) {
return Position { line: self.ranges.len(), character: 0 }; // return next position of last line
}

let line = self.ranges.binary_search(&byte_pos).unwrap_or_else(|x| x);

// convert the byte offset within the line to a character position
let mut byte_count = if line == 0 { 0 } else { self.ranges[line - 1] + 1 };
let slice = self.file.slice(byte_count..self.ranges[line]).unwrap_or("").chars();
let mut char = slice.clone().count();

for (i, c) in slice.enumerate() {
byte_count += c.len_utf8();
if byte_count > byte_pos {
char = i;
break;
}
}

Position { line, character: char }
}

pub fn byte_range_to_lsp_range(&self, byte_range: &std::ops::Range<usize>) -> Range {
let start = self.byte_to_lsp(byte_range.start);
let end = self.byte_to_lsp(byte_range.end);
Range { start, end }
}

// updates content and line ranges from a TextDocumentContentChangeEvent
// lazily: only the newly inserted text is parsed,
// which is optimal for large files with small changes
pub fn lazy_add(&mut self, changes: &ChangeEvent) {
// The whole file changed, or the file was empty, so reparse as a new file
if changes.range.is_none() || self.ranges.is_empty() {
*self = LspFile::new(&changes.text);
return;
}

// calculate position in current file
let start_pos = self.lsp_to_lsp(&changes.range.unwrap().start); // inclusive
let end_pos_exc = self.lsp_to_lsp(&changes.range.unwrap().end); // exclusive

// a reversed range is invalid input; fail loudly
if start_pos > end_pos_exc {
panic!(
"range.start: {:?} is greater than range.end: {:?} in TextDocumentContentChangeEvent!",
start_pos, end_pos_exc
)
}

// parse input
let mut additional_ranges = LspFile::parse_string(&changes.text);
let addition_byte: i64 = additional_ranges.last().map_or(-1, |value| *value as i64);

// align additions to their placement in current file
let start_byte = self.lsp_to_byte(&start_pos);
let end_byte = self.lsp_to_byte(&end_pos_exc);
for elm in &mut additional_ranges {
*elm += start_byte;
}

// cache frequently used positions and sizes
let mut start_line = start_pos.line;
let end_line = end_pos_exc.line;
let range_size = self.ranges.len();

// extra adjustments are needed at the head and tail of the new additions
let tailing_end_bytes = self.lsp_to_byte(&Position { line: end_line + 1, character: 0 }) - end_byte;

// special cases when the change text is empty (a pure deletion)
if additional_ranges.is_empty() {
let end_line_byte = *self.ranges.get(end_line).unwrap_or(self.ranges.last().unwrap());
let val = end_line_byte.wrapping_sub(end_byte).wrapping_add(start_byte) as i64;
// we're deleting the whole file
if val < 0 {
self.file.clear();
self.ranges.clear();
return;
}

// deleting at or past the end of the file changes nothing
if start_line == range_size {
return;
}

// The change is just a deletion
if tailing_end_bytes != 0 || start_pos.character != 0 {
additional_ranges.push(val as usize);
}
} else {
// '\n' is our line break; when adding at the end of the file, avoid a duplicate range entry
if changes.text.chars().last() == Some('\n') && end_line != range_size {
additional_ranges.push(*additional_ranges.last().unwrap());
}
*additional_ranges.last_mut().unwrap() += tailing_end_bytes;
}

// we're appending at the end of the file:
// if the file doesn't end with '\n', merge the addition onto the last line;
// if it does, the addition becomes a new range entry
if start_line == range_size && self.file.chars().last() != Some('\n') {
start_line -= 1;
}

// update file
let range = start_byte..end_byte;
//info!("replacing range {:?} of {:?} with {:?}", range, &self.file[range.clone()], &changes.text);
self.file.replace_range(range, &changes.text);

// remove old ranges and add new ranges
let len = additional_ranges.len();
let s = (start_line).min(range_size);
let e = (end_line + 1).min(range_size);
self.ranges.splice(s..e, additional_ranges); // used for performance benefits

// realign the tail of the old ranges after the edit
let diff = (addition_byte + 1) - (end_byte as i64 - start_byte as i64);
for elm in self.ranges.iter_mut().skip(start_line + len) {
*elm = (*elm as i64 + diff) as usize;
}
}
}
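Aside (not part of this commit): an illustrative walk-through of the conversion rules above on a two-line file. The expected values follow from parse_string (one entry per line, holding the line's last byte offset) and from lazy_add re-parsing only the inserted text.

use analyzer_core::lsp_file::{ChangeEvent, LspFile, Position, Range};

fn conversion_sketch() {
	let mut file = LspFile::new(&"hello\nworld\n".to_string());

	// "hello\n" ends at byte 5, "world\n" at byte 11.
	assert_eq!(file.get_ranges(), &vec![5, 11]);

	// Line 1, character 0 is the 'w' at byte 6; the mapping round-trips.
	assert_eq!(file.lsp_to_byte(&Position::new(1, 0)), 6);
	assert_eq!(file.byte_to_lsp(6), Position::new(1, 0));

	// Replace "hello" with "hi": content and ranges are patched in place.
	file.lazy_add(&ChangeEvent {
		range: Some(Range::new(Position::new(0, 0), Position::new(0, 5))),
		text: "hi".to_string(),
	});
	assert_eq!(file.get_file_content(), "hi\nworld\n");
	assert_eq!(file.get_ranges(), &vec![2, 8]);
}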
2 changes: 1 addition & 1 deletion crates/analyzer-core/src/preprocessor.rs
@@ -572,7 +572,7 @@ mod test {
let mut pp = PreprocessorState::new(|path| FileId::new(&db, path.into()), |_| unreachable!());

let test_id = FileId::new(&db, "<test-code>.p4".into());
let input = Buffer::new(&db, s.into());
let input = Buffer::from_string(&db, &s.to_string());
let lexed = lex(&db, test_id, input);
let mut lexemes = lexed.lexemes(&db).iter().cloned().map(|(tk, span)| (test_id, tk, span)).collect();

2 changes: 1 addition & 1 deletion crates/analyzer-core/tests/lexer.rs
@@ -7,7 +7,7 @@ use pretty_assertions::assert_eq;

fn lex_str(s: &str) -> Vec<Token> {
let db = Database::new(|base, _| Ok(base.into()));
let buf = Buffer::new(&db, s.to_string());
let buf = Buffer::from_string(&db, &s.to_string());
let file_id = FileId::new(&db, "foo.p4".to_string());
let lexed = lex(&db, file_id, buf);
lexed.lexemes(&db).iter().map(|(tk, _)| tk).cloned().collect()