diff options
author | Rory Dudley | 2024-12-16 15:57:00 -0700 |
---|---|---|
committer | Rory Dudley | 2024-12-16 16:03:41 -0700 |
commit | ac5152886fedc9b65fbb874c67711cd5227d2e00 (patch) | |
tree | 66abb5ab7e9cc52b83e6a2b718e86e4c23daed16 /src/buffer.rs | |
parent | 604b148d94b2b48cf2ad1608b5a2d322e341de3c (diff) | |
download | dwarvish-ac5152886fedc9b65fbb874c67711cd5227d2e00.tar.gz |
Workaround for faulty unicode_width values
After a bit of analysis work, I believe the unicode_width library is
returning incorrect widths for certain characters (i.e. returning a
width of 0, when the width should be 1). This patch adds a workaround
for that in the comp() function, alongside a long comment documenting
the issue with more resources and tools.
Signed-off-by: Rory Dudley <rory@netc.lu>
Diffstat (limited to 'src/buffer.rs')
-rw-r--r-- | src/buffer.rs | 49 |
1 files changed, 42 insertions, 7 deletions
diff --git a/src/buffer.rs b/src/buffer.rs index 08c8653..5771668 100644 --- a/src/buffer.rs +++ b/src/buffer.rs @@ -165,8 +165,48 @@ fn comp( *bpos -= *len; } - let ori_path: String = buffer[*bpos..].into_iter().collect::<String>(); - let mut width = UnicodeWidthStr::width(ori_path.as_str()); + // This bit of code is annoying, but it appears that the unicode_width library is (incorrectly) + // computing some character's width to be 0. So, we need to loop through each character from + // the last path, and if unicode_width says its width is 0, add 1 to it. + // + // The string I have been testing with is: "01\ エレクトリック・パブリック.flac". + // There are some unicode characters with a width of zero + // (https://unicode-explorer.com/articles/space-characters), however, I have analyzed the raw + // byte sequence of all the characters in the string above, and I do not believe that any of + // them are zero width chars. + // + // In theory, this workaround may mess up completion for strings that DO have real zero width + // characters. However, as those characters seem to be mostly for typesetting, I am not going + // to worry about it, unless I run into it myself, or someone complains to me about it. 
+ // + // Here is a simple ruby script that might help anyone looking to do their own analysis: + // #!/usr/bin/env ruby + // # frozen_string_literal: true + // + // str = '01\ エレクトリック・パブリック.flac' + // puts "len: #{str.length}" + // + // str.bytes do |b| + // # puts b + // puts b if b == 191 + // end + // + // puts + // # oth = "hi\u200chi" + // oth = "hi\ufeffhi" + // puts oth + // puts oth.bytes do |b| + // puts b + // end + let ori_path = buffer[*bpos..].into_iter().collect::<Vec<_>>(); + let mut width = 0; + for c in ori_path.clone() { + let mut w = UnicodeWidthChar::width(*c).unwrap_or(1); + if w == 0 { + w += 1; + } + width += w; + } // Remove the last autocomplete value from the buffer while *len > 0 { @@ -299,11 +339,6 @@ fn comp( path.file_name().unwrap().to_string_lossy()[word.len()..].to_string() }; - // Reset from previous autocomplete - (0..*len).for_each(|_| print!("\u{8}")); - (0..*len).for_each(|_| print!(" ")); - (0..*len).for_each(|_| print!("\u{8}")); - let mut j = 0; let mut chars = path.chars().collect::<Vec<char>>(); for (i, c) in chars.clone().iter().enumerate() { |