← Back to team overview

anewt-developers team mailing list archive

[Branch ~uws/anewt/anewt.uws] Rev 1780: [core] Initial work on transitioning to UTF-8

 

------------------------------------------------------------
revno: 1780
committer: Wouter Bolsterlee <uws@xxxxxxxxx>
branch nick: anewt.uws
timestamp: Sat 2010-03-27 21:41:53 +0100
message:
  [core] Initial work on transitioning to UTF-8
  
  This is the 21st century, and UTF-8 is the only sane
  encoding to use when working with non-ASCII data. PHP still
  lacks proper Unicode support, but does have the mbstring
  functions to deal with Unicode strings. Anewt now depends on
  the mbstring module, which is pretty standard nowadays
  anyway, and sets the mbstring encoding to UTF-8 by default.
  
  The str_truncate() function is now also UTF-8 aware. It uses
  the mbstring functions to calculate the truncation point,
  and uses a real ellipsis by default when truncating.
modified:
  core/module.doc.xml
  core/string.lib.php


--
lp:anewt
https://code.launchpad.net/~uws/anewt/anewt.uws

Your team Anewt developers is subscribed to branch lp:anewt.
To unsubscribe from this branch go to https://code.launchpad.net/~uws/anewt/anewt.uws/+edit-subscription.
=== modified file 'core/module.doc.xml'
--- core/module.doc.xml	2009-08-02 16:32:09 +0000
+++ core/module.doc.xml	2010-03-27 20:41:53 +0000
@@ -29,6 +29,16 @@
 
 		<anewt:title>String functions</anewt:title>
 
+		<anewt:note>
+
+			<p>Because this is the 21st century, Anewt sets the internal
+			encoding for the <code>mbstring</code> extension to UTF-8 upon
+			load. If you need to work with different encodings and transcoding
+			to something sane is not feasible, make sure to provide the right
+			encoding in all <code>mb_*</code> function calls.</p>
+
+		</anewt:note>
+
 		<p>PHP itself offers a lot of string utility functions, but some
 			<strong>commonly used string logic routines</strong> are not
 			implemented as simple, separate functions. Often, this leads to

=== modified file 'core/string.lib.php'
--- core/string.lib.php	2009-04-12 23:21:36 +0000
+++ core/string.lib.php	2010-03-27 20:41:53 +0000
@@ -15,6 +15,10 @@
  */
 
 
+/* This is the 21st century. Thank you. */
+mb_internal_encoding('UTF-8');
+
+
 /**
  * Test if a string contains the specified substring. Use strpos or other string
  * functions if you want to know where the substring starts.
@@ -255,14 +259,14 @@
  * \return
  *   The resulting string after truncating and addition of the trail.
  */
-function str_truncate($str, $length=70, $trail='...', $use_word_boundaries=true)
+function str_truncate($str, $length=70, $trail='…', $use_word_boundaries=true)
 {
 	/* Use defaults for null parameters */
 	if (is_null($length))
 		$length = 70;
 	
 	if (is_null($trail))
-		$trail = '...';
+		$trail = '…';
 
 	if (is_null($use_word_boundaries))
 		$use_word_boundaries = true;
@@ -277,8 +281,11 @@
 	if ($length <= 0)
 		return '';
 
+	$length_str = mb_strlen($str);
+	$length_trail = mb_strlen($trail);
+
 	/* Don't truncate strings that don't need it */
-	if (strlen($str) <= $length)
+	if ($length_str <= $length)
 		return $str;
 
 	if ($use_word_boundaries)
@@ -286,10 +293,10 @@
 		/* We don't want to do computationally expensive regexp operations on
 		 * strings that are way too long, so we cut off before using regular
 		 * expressions. */
-		$str = substr($str, 0, $length - strlen($trail) + 1);
+		$str = mb_substr($str, 0, $length - $length_trail + 1);
 		$str = preg_replace('/\s+?(\S+)?$/', '', $str);
 
-		if (strlen($str) <= $length - strlen($trail))
+		if ($length_str <= $length - $length_trail)
 		{
 			/* Yay, all is fine because we found a usable word boundary! */
 			return $str . $trail;
@@ -304,7 +311,7 @@
 	}
 
 	/* Cut off, ignoring word boundaries. */
-	return substr($str, 0, $length - strlen($trail)) . $trail;
+	return mb_substr($str, 0, $length - $length_trail) . $trail;
 }
 
 /**