Workaround for faulty unicode_width values

After a bit of analysis work, I believe the unicode_width library is returning incorrect widths for certain characters (i.e. returning a width of 0 when the width should be 1). This patch adds a workaround for that in the comp() function, alongside a long comment documenting the issue with more resources and tools. Signed-off-by: Rory Dudley <[email protected]>
author: Rory Dudley 2024-12-16 15:57:00 -0700
committer: Rory Dudley 2024-12-16 16:03:41 -0700
commit: ac5152886fedc9b65fbb874c67711cd5227d2e00 (patch)
tree: 66abb5ab7e9cc52b83e6a2b718e86e4c23daed16 /src/buffer.rs
parent: 604b148d94b2b48cf2ad1608b5a2d322e341de3c (diff)
download: dwarvish-ac5152886fedc9b65fbb874c67711cd5227d2e00.tar.gz
1 files changed, 42 insertions, 7 deletions
diff --git a/src/buffer.rs b/src/buffer.rs
index 08c8653..5771668 100644
--- a/src/buffer.rs
+++ b/src/buffer.rs
@@ -165,8 +165,48 @@ fn comp(
         *bpos -= *len;
     }
 
-    let ori_path: String = buffer[*bpos..].into_iter().collect::<String>();
-    let mut width = UnicodeWidthStr::width(ori_path.as_str());
+    // This bit of code is annoying, but it appears that the unicode_width library is (incorrectly)
+    // computing some character's width to be 0. So, we need to loop through each character from
+    // the last path, and if unicode_width says its width is 0, add 1 to it.
+    //
+    // The string I have been testing with is: "01\ ｴﾚｸﾄﾘｯｸ･ﾊﾟﾌﾞﾘｯｸ.flac".
+    // There are some unicode characters with a width of zero
+    // (https://unicode-explorer.com/articles/space-characters), however, I have analyzed the raw
+    // byte sequence of all the characters in the string above, and I do not believe that any of
+    // them are zero width chars.
+    //
+    // In theory, this workaround may mess up completion for strings that DO have real zero width
+    // characters. However, as those characters seem to be mostly for typesetting, I am not going
+    // to worry about it, unless I run into it myself, or someone complains to me about it.
+    //
+    // Here is a simple ruby script that might help anyone looking to do their own analysis:
+    // #!/usr/bin/env ruby
+    // # frozen_string_literal: true
+    //
+    // str = '01\ ｴﾚｸﾄﾘｯｸ･ﾊﾟﾌﾞﾘｯｸ.flac'
+    // puts "len: #{str.length}"
+    //
+    // str.bytes do |b|
+    //   # puts b
+    //   puts b if b == 191
+    // end
+    //
+    // puts
+    // # oth = "hi\u200chi"
+    // oth = "hi\ufeffhi"
+    // puts oth
+    // puts oth.bytes do |b|
+    //   puts b
+    // end
+    let ori_path = buffer[*bpos..].into_iter().collect::<Vec<_>>();
+    let mut width = 0;
+    for c in ori_path.clone() {
+        let mut w = UnicodeWidthChar::width(*c).unwrap_or(1);
+        if w == 0 {
+            w += 1;
+        }
+        width += w;
+    }
 
     // Remove the last autocomplete value from the buffer
     while *len > 0 {
@@ -299,11 +339,6 @@ fn comp(
         path.file_name().unwrap().to_string_lossy()[word.len()..].to_string()
     };
 
-    // Reset from previous autocomplete
-    (0..*len).for_each(|_| print!("\u{8}"));
-    (0..*len).for_each(|_| print!(" "));
-    (0..*len).for_each(|_| print!("\u{8}"));
-
     let mut j = 0;
     let mut chars = path.chars().collect::<Vec<char>>();
     for (i, c) in chars.clone().iter().enumerate() {
author	Rory Dudley	2024-12-16 15:57:00 -0700
committer	Rory Dudley	2024-12-16 16:03:41 -0700
commit	ac5152886fedc9b65fbb874c67711cd5227d2e00 (patch)
tree	66abb5ab7e9cc52b83e6a2b718e86e4c23daed16 /src/buffer.rs
parent	604b148d94b2b48cf2ad1608b5a2d322e341de3c (diff)
download	dwarvish-ac5152886fedc9b65fbb874c67711cd5227d2e00.tar.gz