Browse Source

Merge branch '3616_utf8_normalize_speedup'

* 3616_utf8_normalize_speedup:
  Ticket #3616: speed up of utf-8 normalization.
Andrew Borodin 7 years ago
parent
commit
37013e7db3
1 changed files with 19 additions and 0 deletions
  1. 19 0
      lib/strutil/strutilutf8.c

+ 19 - 0
lib/strutil/strutilutf8.c

@@ -1080,6 +1080,25 @@ str_utf8_normalize (const char *text)
     const char *start;
     const char *end;
 
+    /* g_utf8_normalize() is a heavyweight function that converts UTF-8 into UCS-4,
+     * does the normalization, and then converts UCS-4 back into UTF-8.
+     * Since file names are composed of ASCII characters in most cases, we can speed up
+     * UTF-8 normalization by checking whether the heavyweight Unicode normalization is
+     * actually needed. Normalization of an ASCII-only string is a no-op.
+     */
+
+    /* find out whether text is ASCII only */
+    for (end = text; *end != '\0'; end++)
+        if ((*end & 0x80) != 0)
+        {
+            /* found a non-ASCII byte (high bit set), i.e. part of a multi-byte UTF-8 sequence */
+            break;
+        }
+
+    /* if text is ASCII-only, return copy, normalize otherwise */
+    if (*end == '\0')
+        return g_strndup (text, end - text);
+
     fixed = g_string_sized_new (4);
 
     start = text;