author     Rory Dudley    2024-12-16 15:57:00 -0700
committer  Rory Dudley    2024-12-16 16:03:41 -0700
commit     ac5152886fedc9b65fbb874c67711cd5227d2e00 (patch)
tree       66abb5ab7e9cc52b83e6a2b718e86e4c23daed16 /src
parent     604b148d94b2b48cf2ad1608b5a2d322e341de3c (diff)
download   dwarvish-ac5152886fedc9b65fbb874c67711cd5227d2e00.tar.gz
Workaround for faulty unicode_width values
After a bit of analysis work, I believe the unicode_width library is returning incorrect widths for certain characters (i.e., returning a width of 0 when the width should be 1). This patch adds a workaround for that in the comp() function, alongside a long comment documenting the issue with more resources and tools.

Signed-off-by: Rory Dudley <rory@netc.lu>
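For anyone repeating the analysis on the Rust side, here is a minimal, standalone sketch (not part of the patch) that checks what the unicode-width crate reports for the test string mentioned below. It assumes only the UnicodeWidthStr and UnicodeWidthChar traits that buffer.rs already uses; the exact numbers depend on the unicode-width version pinned in Cargo.lock.

use unicode_width::{UnicodeWidthChar, UnicodeWidthStr};

fn main() {
    // Test string from the commit (the backslash is escaped for Rust source).
    let s = "01\\ エレクトリック・パブリック.flac";

    // Width of the whole string as unicode-width sees it.
    println!("str width: {}", UnicodeWidthStr::width(s));

    // Per-character widths; a `Some(0)` for a character that clearly occupies
    // a terminal cell is the behaviour this patch works around.
    for c in s.chars() {
        println!("{:?} -> {:?}", c, UnicodeWidthChar::width(c));
    }
}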
Diffstat (limited to 'src')
-rw-r--r--  src/buffer.rs  |  49
1 file changed, 42 insertions(+), 7 deletions(-)
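Before the diff itself, the same clamping idea is shown as a self-contained helper, purely for orientation; display_width is a hypothetical name for illustration, and the patch below inlines the equivalent loop directly in comp() rather than adding a function.

use unicode_width::UnicodeWidthChar;

// Sum per-character display widths, but never let a character count as zero
// cells: a reported width of 0 (or an unknown width) is treated as 1.
fn display_width(chars: &[char]) -> usize {
    chars
        .iter()
        .map(|&c| UnicodeWidthChar::width(c).unwrap_or(1).max(1))
        .sum()
}

The .max(1) here plays the same role as the `if w == 0 { w += 1; }` branch in the hunk below.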
diff --git a/src/buffer.rs b/src/buffer.rs
index 08c8653..5771668 100644
--- a/src/buffer.rs
+++ b/src/buffer.rs
@@ -165,8 +165,48 @@ fn comp(
*bpos -= *len;
}
- let ori_path: String = buffer[*bpos..].into_iter().collect::<String>();
- let mut width = UnicodeWidthStr::width(ori_path.as_str());
+ // This bit of code is annoying, but it appears that the unicode_width library is (incorrectly)
+ // computing some characters' widths to be 0. So, we need to loop through each character from
+ // the last path, and if unicode_width says its width is 0, add 1 to it.
+ //
+ // The string I have been testing with is: "01\ エレクトリック・パブリック.flac".
+ // There are some Unicode characters with a width of zero
+ // (https://unicode-explorer.com/articles/space-characters); however, I have analyzed the raw
+ // byte sequence of all the characters in the string above, and I do not believe that any of
+ // them are zero-width chars.
+ //
+ // In theory, this workaround may mess up completion for strings that DO have real zero width
+ // characters. However, as those characters seem to be mostly for typesetting, I am not going
+ // to worry about it, unless I run into it myself, or someone complains to me about it.
+ //
+ // Here is a simple ruby script that might help anyone looking to do their own analysis:
+ // #!/usr/bin/env ruby
+ // # frozen_string_literal: true
+ //
+ // str = '01\ エレクトリック・パブリック.flac'
+ // puts "len: #{str.length}"
+ //
+ // str.bytes do |b|
+ // # puts b
+ // puts b if b == 191
+ // end
+ //
+ // puts
+ // # oth = "hi\u200chi"
+ // oth = "hi\ufeffhi"
+ // puts oth
+ // puts oth.bytes do |b|
+ // puts b
+ // end
+ let ori_path = buffer[*bpos..].into_iter().collect::<Vec<_>>();
+ let mut width = 0;
+ for c in ori_path.clone() {
+ let mut w = UnicodeWidthChar::width(*c).unwrap_or(1);
+ if w == 0 {
+ w += 1;
+ }
+ width += w;
+ }
// Remove the last autocomplete value from the buffer
while *len > 0 {
@@ -299,11 +339,6 @@ fn comp(
path.file_name().unwrap().to_string_lossy()[word.len()..].to_string()
};
- // Reset from previous autocomplete
- (0..*len).for_each(|_| print!("\u{8}"));
- (0..*len).for_each(|_| print!(" "));
- (0..*len).for_each(|_| print!("\u{8}"));
-
let mut j = 0;
let mut chars = path.chars().collect::<Vec<char>>();
for (i, c) in chars.clone().iter().enumerate() {