← Back to team overview

kernel-packages team mailing list archive

[Bug 1214624] [NEW] False endianness in BOM-less UTF-16 strings

 

Public bug reported:

In some MP3 files, ID3v2 tags are encoded in UTF-16 LE without a BOM.
"libid3tag" defaults to BE when no BOM is present. Decoding with the wrong endianness produces "mojibake", i.e. pseudo-"Chinese" characters.
We experience this problem in Audacity.
The following patch implements a reliable auto-detection of the endianness.

Best regards,

Joel Bouchat

Index: utf16.c
===================================================================
--- utf16.c	(revision 12463)
+++ utf16.c	(working copy)
@@ -256,17 +256,46 @@
     return 0;
 
   if (byteorder == ID3_UTF16_BYTEORDER_ANY && end - *ptr > 0) {
-    switch (((*ptr)[0] << 8) |
-	    ((*ptr)[1] << 0)) {
-    case 0xfeff:
-      byteorder = ID3_UTF16_BYTEORDER_BE;
-      *ptr += 2;
-      break;
+    switch (((*ptr)[0] << 8) | ((*ptr)[1] << 0)) {
+      case 0xfeff:
+        byteorder = ID3_UTF16_BYTEORDER_BE;
+        *ptr += 2;
+        break;
 
-    case 0xfffe:
-      byteorder = ID3_UTF16_BYTEORDER_LE;
-      *ptr += 2;
-      break;
+      case 0xfffe:
+        byteorder = ID3_UTF16_BYTEORDER_LE;
+        *ptr += 2;
+        break;
+
+      default: {
+        id3_length_t i; int nb0 = 0; int nb1 = 0;
+        byteorder = ID3_UTF16_BYTEORDER_BE; // defaults to Big Endian
+        // There is no BOM is this UTF16 string
+        // Figure out the byte order
+        for(i = 0; i < length/2; i+=2) {
+          id3_byte_t c0 = (*ptr)[i];
+          id3_byte_t c1 = (*ptr)[i+1];
+          if(c0 == 0x20 && c1 == 0x00) {
+            // LE space character detected
+            byteorder = ID3_UTF16_BYTEORDER_LE;
+            break;
+          }
+          if(c0 == 0x00 && c1 == 0x20) {
+            // BE space character detected
+            break;
+          }
+          if(c0 > 0)
+            nb0++;
+          if(c1 > 0)
+            nb1++;
+        }
+        if(i >= length/2) {
+          // No space character in the string: must use statistical approach
+          // by counting the number of Latin ISO 8 bit characters
+          if(nb1 < nb0)  
+            byteorder = ID3_UTF16_BYTEORDER_LE;
+        }
+      }
     }
   }

** Affects: libid3tag (Ubuntu)
     Importance: Undecided
         Status: New

-- 
You received this bug notification because you are a member of Kernel
Packages, which is subscribed to libid3tag in Ubuntu.
https://bugs.launchpad.net/bugs/1214624

Title:
  False endianness in BOM-less UTF-16 strings

Status in “libid3tag” package in Ubuntu:
  New

Bug description:
  In some MP3 files, ID3v2 tags are encoded in UTF-16 LE without a BOM.
  "libid3tag" defaults to BE when no BOM is present. Decoding with the wrong endianness produces "mojibake", i.e. pseudo-"Chinese" characters.
  We experience this problem in Audacity.
  The following patch implements a reliable auto-detection of the endianness.

  Best regards,

  Joel Bouchat

  Index: utf16.c
  ===================================================================
  --- utf16.c	(revision 12463)
  +++ utf16.c	(working copy)
  @@ -256,17 +256,46 @@
       return 0;
   
     if (byteorder == ID3_UTF16_BYTEORDER_ANY && end - *ptr > 0) {
  -    switch (((*ptr)[0] << 8) |
  -	    ((*ptr)[1] << 0)) {
  -    case 0xfeff:
  -      byteorder = ID3_UTF16_BYTEORDER_BE;
  -      *ptr += 2;
  -      break;
  +    switch (((*ptr)[0] << 8) | ((*ptr)[1] << 0)) {
  +      case 0xfeff:
  +        byteorder = ID3_UTF16_BYTEORDER_BE;
  +        *ptr += 2;
  +        break;
   
  -    case 0xfffe:
  -      byteorder = ID3_UTF16_BYTEORDER_LE;
  -      *ptr += 2;
  -      break;
  +      case 0xfffe:
  +        byteorder = ID3_UTF16_BYTEORDER_LE;
  +        *ptr += 2;
  +        break;
  +
  +      default: {
  +        id3_length_t i; int nb0 = 0; int nb1 = 0;
  +        byteorder = ID3_UTF16_BYTEORDER_BE; // defaults to Big Endian
  +        // There is no BOM is this UTF16 string
  +        // Figure out the byte order
  +        for(i = 0; i < length/2; i+=2) {
  +          id3_byte_t c0 = (*ptr)[i];
  +          id3_byte_t c1 = (*ptr)[i+1];
  +          if(c0 == 0x20 && c1 == 0x00) {
  +            // LE space character detected
  +            byteorder = ID3_UTF16_BYTEORDER_LE;
  +            break;
  +          }
  +          if(c0 == 0x00 && c1 == 0x20) {
  +            // BE space character detected
  +            break;
  +          }
  +          if(c0 > 0)
  +            nb0++;
  +          if(c1 > 0)
  +            nb1++;
  +        }
  +        if(i >= length/2) {
  +          // No space character in the string: must use statistical approach
  +          // by counting the number of Latin ISO 8 bit characters
  +          if(nb1 < nb0)  
  +            byteorder = ID3_UTF16_BYTEORDER_LE;
  +        }
  +      }
       }
     }

To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu/+source/libid3tag/+bug/1214624/+subscriptions


Follow ups

References