zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #03009
[Merge] lp:~zorba-coders/zorba/fix_bug_871051 into lp:zorba/data-converters-module
Sorin Marian Nasoi has proposed merging lp:~zorba-coders/zorba/fix_bug_871051 into lp:zorba/data-converters-module.
Requested reviews:
Gabriel Petrovay (gabipetrovay)
Sorin Marian Nasoi (sorin.marian.nasoi)
Bruno Martins (bgmartins)
Matthias Brantner (matthias-brantner)
Diogo Simões (diogo-simoes89)
Related bugs:
Bug #871051 in Zorba: "3 data-cleaning tests failing"
https://bugs.launchpad.net/zorba/+bug/871051
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/fix_bug_871051/+merge/86700
Fix for bug lp:871051.
--
https://code.launchpad.net/~zorba-coders/zorba/fix_bug_871051/+merge/86700
Your team Zorba Coders is subscribed to branch lp:zorba/data-converters-module.
=== added file 'CMakeLists.txt'
--- CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,30 @@
+# Copyright 2006-2010 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+
+PROJECT (zorba_data-cleaning_module)
+ENABLE_TESTING ()
+INCLUDE (CTest)
+
+LIST (APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules")
+
+FIND_PACKAGE (Zorba REQUIRED HINTS "${ZORBA_BUILD_DIR}")
+INCLUDE ("${Zorba_USE_FILE}")
+
+ADD_TEST_DIRECTORY("${PROJECT_SOURCE_DIR}/test" "${EXCEPTION_LIST}")
+
+ADD_SUBDIRECTORY("src")
+
+DONE_DECLARING_ZORBA_URIS()
=== renamed file 'CMakeLists.txt' => 'CMakeLists.txt.moved'
=== added directory 'cmake_modules'
=== renamed directory 'cmake_modules' => 'cmake_modules.moved'
=== added file 'cmake_modules/CMakeCompareVersionStrings.cmake'
--- cmake_modules/CMakeCompareVersionStrings.cmake 1970-01-01 00:00:00 +0000
+++ cmake_modules/CMakeCompareVersionStrings.cmake 2011-12-22 13:29:42 +0000
@@ -0,0 +1,84 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Computes the relationship between two version strings. A version
+# string is a number delineated by '.'s such as 1.3.2 and 0.99.9.1.
+# You can feed version strings with different number of dot versions,
+# and the shorter version number will be padded with zeros: 9.2 <
+# 9.2.1 will actually compare 9.2.0 < 9.2.1.
+#
+# Input: a_in - value, not variable
+# b_in - value, not variable
+# result_out - variable with value:
+# -1 : a_in < b_in
+# 0 : a_in == b_in
+# 1 : a_in > b_in
+#
+# Written by James Bigler.
+MACRO(COMPARE_VERSION_STRINGS a_in b_in result_out)
+  # Since SEPARATE_ARGUMENTS uses ' ' as the separation token,
+ # replace '.' with ' ' to allow easy tokenization of the string.
+ STRING(REPLACE "." " " a "${a_in}")
+ STRING(REPLACE "." " " b "${b_in}")
+ SEPARATE_ARGUMENTS(a)
+ SEPARATE_ARGUMENTS(b)
+
+ # Check the size of each list to see if they are equal.
+ LIST(LENGTH a a_length)
+ LIST(LENGTH b b_length)
+
+ # Pad the shorter list with zeros.
+
+ # Note that range needs to be one less than the length as the for
+ # loop is inclusive (silly CMake).
+ IF(a_length LESS b_length)
+ # a is shorter
+ SET(shorter a)
+ MATH(EXPR range "${b_length} - 1")
+ MATH(EXPR pad_range "${b_length} - ${a_length} - 1")
+ ELSE(a_length LESS b_length)
+ # b is shorter
+ SET(shorter b)
+ MATH(EXPR range "${a_length} - 1")
+ MATH(EXPR pad_range "${a_length} - ${b_length} - 1")
+ ENDIF(a_length LESS b_length)
+
+ # PAD out if we need to
+ IF(NOT pad_range LESS 0)
+ FOREACH(pad RANGE ${pad_range})
+      # Since shorter is an alias for a or b, we need to get to it by dereferencing shorter.
+ LIST(APPEND ${shorter} 0)
+ ENDFOREACH(pad RANGE ${pad_range})
+ ENDIF(NOT pad_range LESS 0)
+
+ SET(result 0)
+ FOREACH(index RANGE ${range})
+ IF(result EQUAL 0)
+ # Only continue to compare things as long as they are equal
+ LIST(GET a ${index} a_version)
+ LIST(GET b ${index} b_version)
+ # LESS
+ IF(a_version LESS b_version)
+ SET(result -1)
+ ENDIF(a_version LESS b_version)
+ # GREATER
+ IF(a_version GREATER b_version)
+ SET(result 1)
+ ENDIF(a_version GREATER b_version)
+ ENDIF(result EQUAL 0)
+ ENDFOREACH(index)
+
+ # Copy out the return result
+ SET(${result_out} ${result})
+ENDMACRO(COMPARE_VERSION_STRINGS)
=== added directory 'src'
=== renamed directory 'src' => 'src.moved'
=== added file 'src/CMakeLists.txt'
--- src/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,20 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+MESSAGE(STATUS "Add com")
+ADD_SUBDIRECTORY(com)
+
+MESSAGE(STATUS "End modules")
=== added directory 'src/com'
=== added file 'src/com/CMakeLists.txt'
--- src/com/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(zorba-xquery)
=== added directory 'src/com/zorba-xquery'
=== added file 'src/com/zorba-xquery/CMakeLists.txt'
--- src/com/zorba-xquery/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(www)
=== added directory 'src/com/zorba-xquery/www'
=== added file 'src/com/zorba-xquery/www/CMakeLists.txt'
--- src/com/zorba-xquery/www/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(modules)
=== added directory 'src/com/zorba-xquery/www/modules'
=== added file 'src/com/zorba-xquery/www/modules/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(data-cleaning)
=== added directory 'src/com/zorba-xquery/www/modules/data-cleaning'
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,40 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity"
+ VERSION 2.0 FILE "character-based-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/consolidation"
+ VERSION 2.0 FILE "consolidation.xq")
+
+DECLARE_ZORBA_SCHEMA( FILE whitepages_schema.xsd
+ URI "http://api.whitepages.com/schema/")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/conversion"
+ VERSION 2.0 FILE "conversion.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity"
+ VERSION 2.0 FILE "hybrid-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/normalization"
+ VERSION 2.0 FILE "normalization.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity"
+ VERSION 2.0 FILE "phonetic-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity"
+ VERSION 2.0 FILE "set-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity"
+ VERSION 2.0 FILE "token-based-string-similarity.xq")
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,177 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides character-based string similarity functions
+ : that view strings as sequences of characters, generally computing a similarity score
+ : that corresponds to the cost of transforming one string into another.
+ :
+ : These functions are particularly useful for matching near duplicate strings
+ : in the presence of typographical errors.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the edit distance between two strings.
+ :
+ : This distance, also referred to as the Levenshtein distance, is defined as the minimum number
+ : of edits needed to transform one string into the other, with the allowable edit operations
+ : being insertion, deletion, or substitution of a single character.
+ :
+ : <br/>
+ : Example usage : <pre> edit-distance("FLWOR", "FLOWER") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 2 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return The edit distance between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq
+ :)
+declare function simc:edit-distance ( $s1 as xs:string, $s2 as xs:string ) as xs:integer {
+ if(string-length($s1) = 0) then string-length($s2) else
+ if(string-length($s2) = 0) then string-length($s1) else
+ min((
+ simc:edit-distance(substring($s1, 2), $s2) + 1 ,
+ simc:edit-distance($s1, substring($s2, 2)) + 1 ,
+ simc:edit-distance(substring($s1, 2), substring($s2, 2)) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then 0 else 1 )
+ ))
+};
+
+(:~
+ : Returns the Jaro similarity coefficient between two strings.
+ :
+ : This similarity coefficient is based on the number of transposed characters and on a
+ : weighted sum of the percentage of matched characters held within the strings. The higher
+ : the Jaro-Winkler value is, the more similar the strings are. The coefficient is
+ : normalized such that 0 equates to no similarity and 1 is an exact match.
+ :
+ : <br/>
+ : Example usage : <pre> jaro("FLWOR Found.", "FLWOR Foundation") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5853174603174603 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return The Jaro similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/jaro.xq
+ :)
+declare function simc:jaro ( $s1 as xs:string, $s2 as xs:string ) as xs:double {
+ let $s := for $i in ($s1,$s2) order by string-length($i) return $i
+ let $l1 := string-length($s[1])
+ let $l2 := string-length($s[2])
+ let $mt := xs:integer((max(($l1,$l2)) div 2.0) - 1)
+ let $mc := for $i in 1 to min( ($l1 , $l2) )
+ let $auxmatch := substring($s[2], max((1,$i - $mt)), $mt * 2 )
+ return for $j in 1 to string-length($auxmatch)
+ where substring($auxmatch, $j, 1) = substring($s[1], $i, 1)
+ return <match char="{substring($s[1], $i, 1)}" pos1="{$i}" pos2="{$j + max((1,$i - $mt)) - 1}" />
+ let $m := if (count($mc) = 0) then (1) else (count($mc))
+ let $t := count( for $i in $mc, $j in $mc where $i/@pos1>$j/@pos1 and $i/@pos2<$j/@pos2 return $i )
+ let $dist := xs:double((($m div $l1) + ($m div $l2) + (($m - $t) div $m)) div 3)
+ return $dist
+};
+
+(:~
+ : Returns the Jaro-Winkler similarity coefficient between two strings.
+ :
+ : This similarity coefficient corresponds to an extension of the Jaro similarity coefficient that weights or
+ : penalizes strings based on their similarity at the beginning of the string, up to a given prefix size.
+ :
+ : <br/>
+ : Example usage : <pre> jaro-winkler("DWAYNE", "DUANE", 4, 0.1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.8577777777777778 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $prefix The number of characters to consider when testing for equal prefixes in the strings.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes.
+ : @return The Jaro-Winkler similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq
+ :)
+declare function simc:jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double {
+ let $jaro := simc:jaro( $s1 , $s2 )
+ let $cc := for $i in 1 to min(($prefix, string-length($s1), string-length($s2)))
+ where substring($s1, 0, $i) = substring($s2, 0, $i) return $i
+ return ($jaro + ( $fact * max($cc) * ( 1 - $jaro ) ) )
+};
+
+(:~
+ : Returns the Needleman-Wunsch distance between two strings.
+ :
+ : The Needleman-Wunsch distance is similar to the basic edit distance metric, adding a
+ : variable cost adjustment to the cost of a gap (i.e., an insertion or deletion) in the
+ : distance metric.
+ :
+ : <br/>
+ : Example usage : <pre> needleman-wunsch("KAK", "KQRK", 1, 1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $score The score value.
+ : @param $penalty The penalty value.
+ : @return The Needleman-Wunsch distance between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq
+ :)
+declare function simc:needleman-wunsch ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
+
+ if(string-length($s1) = 0) then string-length($s2)* - $penalty else
+ if(string-length($s2) = 0) then string-length($s1)* - $penalty else
+ max((
+ simc:needleman-wunsch(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
+ simc:needleman-wunsch($s1, substring($s2, 2), $score, $penalty) - $penalty ,
+ simc:needleman-wunsch(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
+ ))
+};
+
+(:~
+ : Returns the Smith-Waterman distance between two strings.
+ :
+ : <br/>
+ : Example usage : <pre> smith-waterman("ACACACTA", "AGCACACA", 2, 1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 12 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $score The score value.
+ : @param $penalty The penalty value.
+ : @return The Smith-Waterman distance between the two strings.
+ :)
+declare function simc:smith-waterman ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
+ if(string-length($s1) = 0) then 0 else
+ if(string-length($s2) = 0) then 0 else
+ max((
+ 0,
+ simc:smith-waterman(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
+ simc:smith-waterman($s1, substring($s2, 2), $score, $penalty) - $penalty ,
+ simc:smith-waterman(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
+ ))
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,579 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data consolidation functions that generally take as input a sequence of XML nodes
+ : and apply some rule in order to decide which node is better suited to represent the entire sequence.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the consolidation functions based on matching sequences against XPath expressions require
+ : some form of dynamic evaluation for XPath expressions,
+ : such as the x:eval() function provided in the Qizx XQuery Engine.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the single most frequent node in a sequence of nodes provided as input.
+ : If more than one answer is possible, returns the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-frequent( ( "a", "a", "b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The most frequent node in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-frequent.xq
+ :)
+declare function con:most-frequent ( $s ) {
+ (for $str in set:distinct($s) order by count($s[deep-equal(.,$str)]) descending return $str)[1]
+};
+
+(:~
+ : Returns the single less frequent node in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-frequent( ( "a", "a", "b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("b") </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The least frequent node in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/leastfrequent_1.xq
+ :)
+declare function con:least-frequent ( $s ) {
+ let $aux := for $str in set:distinct($s) order by count($s[deep-equal(.,$str)]) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single longest string, in terms of the number of characters, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> con:longest( ( "a", "aa", "aaa") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("aaa") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The longest string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/longest_1.xq
+ :)
+declare function con:longest ( $s as xs:string* ) as xs:string? {
+ let $aux := for $str in $s order by string-length($str) descending return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single shortest string, in terms of the number of characters, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> shortest( ( "a", "aa", "aaa") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The shortest string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/shortest_1.xq
+ :)
+declare function con:shortest( $s as xs:string* ) as xs:string? {
+ let $aux := for $str in $s order by string-length($str) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single longest string, in terms of the number of tokens, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-tokens( ( "a b c", "a b", "a"), " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a b c") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The longest string in the input sequence, in terms of the number of tokens.
+ : @example test/Queries/data-cleaning/consolidation/most-tokens.xq
+ :)
+declare function con:most-tokens ( $s as xs:string*, $r as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by count(tokenize($str,$r)) descending return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single shortest string, in terms of the number of tokens, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-tokens( ( "a b c", "a b", "a"), " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The shortest string in the input sequence, in terms of the number of tokens.
+ : @example test/Queries/data-cleaning/consolidation/least-tokens.xq
+ :)
+declare function con:least-tokens ( $s as xs:string*, $r as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by count(tokenize($str,$r)) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the strings from an input sequence of strings that match a particular regular expression.
+ :
+ : <br/>
+ : Example usage : <pre> matching( ( "a A b", "c AAA d", "e BB f"), "A+" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "a A b", "c AAA d") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r The regular expression to be used in the matching.
+ : @return The strings in the input sequence that match the input regular expression.
+ : @example test/Queries/data-cleaning/consolidation/matching_1.xq
+ :)
+declare function con:matching ( $s as xs:string*, $r as xs:string ) as xs:string* {
+ for $str in $s where matches($str,$r) return $str
+};
+
+(:~
+ : Returns the single string, from an input sequence of strings, that appears more frequently as part
+ : of the other strings in the sequence. If no such string exists, the function returns an empty sequence.
+ : If more than one answer is possible, the function returns the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> super-string( ( "aaa bbb ccc", "aaa bbb", "aaa ddd", "eee fff" ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "aaa bbb" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The string that appears more frequently as part of the other strings in the sequence.
+ : @example test/Queries/data-cleaning/consolidation/superstring_1.xq
+ :)
+declare function con:superstring ( $s as xs:string* ) as xs:string? {
+ let $aux :=
+ for $str in $s
+ let $cnt := count ( for $str2 in $s return if(contains($str2,$str)) then $str else () )
+ where $cnt > 1
+ order by $cnt descending
+ return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single most similar string, in terms of the edit distance metric towards an input string,
+ : in a sequence of strings provided as input. If more than one string has a maximum similarity (a minimum
+ : value for the edit distance metric), the function return the first string according to the order of the
+ : input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "aaabbb" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $m The string towards which we want to measure the edit distance.
+ : @return The most similar string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq
+ :)
+declare function con:most-similar-edit-distance ( $s as xs:string*, $m as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by simc:edit-distance($str,$m) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single least similar string, in terms of the edit distance metric towards an input string,
+ : in a sequence of strings provided as input. If more than one string has a minimum similarity (a maximum
+ : value for the edit distance metric), return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "eeefff" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $m The string towards which we want to measure the edit distance.
+ : @return The least similar string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq
+ :)
+declare function con:least-similar-edit-distance ( $s as xs:string*, $m as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by simc:edit-distance($str,$m) descending return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single node having the largest number of descending elements (sub-elements at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-elements( ( <a><b/></a>, <a/>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-elements.xq
+ :)
+declare function con:most-elements ( $s ) {
+ (: Sort the distinct input nodes by their element count (self plus all
+    descendants), largest first; the stable sort keeps ties in sequence order. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::element()) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending attributes (attributes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a att1="a1" att2="a2"/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-attributes.xq
+ :)
+declare function con:most-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of attributes found on the
+    node itself or any descendant, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::*/attribute()) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending nodes (sub-nodes at any given depth) in a
+ : sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-nodes( ( <a><b/></a>, <a/>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-nodes.xq
+ :)
+declare function con:most-nodes ( $s ) {
+ (: Sort the distinct input nodes by their total node count (self plus all
+    descendant nodes of any kind), largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::node()) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending elements (sub-elements at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-elements( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-elements.xq
+ :)
+declare function con:least-elements ( $s ) {
+ (: Sort the distinct input nodes by their element count (self plus all
+    descendants), smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::element())
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending attributes (attributes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<c/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-attributes.xq
+ :)
+declare function con:least-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of attributes found on the
+    node itself or any descendant, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::*/attribute())
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending nodes (sub-nodes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-nodes.xq
+ :)
+declare function con:least-nodes ( $s ) {
+ (: Sort the distinct input nodes by their total node count (self plus all
+    descendant nodes of any kind), smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::node())
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending elements (sub-elements at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-elements( ( <a><b/><c/><d/></a>, <a><b/><b/><c/></a>, <a/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/><c/><d/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-elements.xq
+ :)
+declare function con:most-distinct-elements ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* elements among
+    self and all descendants, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::element())) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending attributes (attributes at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-attributes( ( <a att1="a1" att2="a2" att3="a3"/>, <a att1="a1" att2="a2"><b att2="a2" /></a>, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a att1="a1" att2="a2" att3="a3"/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq
+ :)
+declare function con:most-distinct-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* attributes found
+    on the node itself or any descendant, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::*/attribute())) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending nodes (sub-nodes at any given depth) in
+ : a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-nodes( ( <a><b/></a>, <a><a/></a>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq
+ :)
+declare function con:most-distinct-nodes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* nodes among
+    self and all descendants, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::node())) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending elements (sub-elements at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-elements( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-elements.xq
+ :)
+declare function con:least-distinct-elements ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* elements among
+    self and all descendants, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::element()))
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending attributes (attributes at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<c/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq
+ :)
+declare function con:least-distinct-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* attributes found
+    on the node itself or any descendant, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::*/attribute()))
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending nodes (sub-nodes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq
+ :)
+declare function con:least-distinct-nodes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* nodes among
+    self and all descendants, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::node()))
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the elements from an input sequence of elements that, when matched to a given set of XPath expressions,
+ : produce a non-empty set of nodes in all the cases.
+ :
+ : <br/>
+ : Example usage : <pre> all-xpaths( ( <a><b/></a>, <c><d/></c>, <d/>), (".//b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The elements that, when matched to the given set of XPath expressions, always return a non-empty set of nodes.
+ :)
+declare function con:all-xpaths ( $s as element()* , $paths as xs:string* ) {
+(: NOTE(review): the eval-based implementation below is intentionally
+   commented out (workaround for bug lp:871051); the function currently
+   returns the empty string instead of the documented element sequence.
+   Confirm whether callers/tests expect this stub behavior. :)
+(:
+ for $str in set:distinct($s)
+ where every $path in $paths satisfies count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0
+ return $str
+ :)
+ ""
+};
+
+(:~
+ : Returns the elements from a sequence of elements that, when matched to a given set of XPath expressions,
+ : produce a non-empty set of nodes for some of the cases.
+ :
+ : <br/>
+ : Example usage : <pre> some-xpaths( ( <a><b/></a>, <d><c/></d>, <d/>), (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <a><b/></a> , <d><c/></d> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The elements that, when matched to the given set of XPath expressions, return a non-empty set of nodes
+ : for at least one of the cases.
+ :)
+declare function con:some-xpaths ( $s as element()* , $paths as xs:string* ) {
+(: NOTE(review): the eval-based implementation below is intentionally
+   commented out (workaround for bug lp:871051); the function currently
+   returns the empty string instead of the documented element sequence.
+   Confirm whether callers/tests expect this stub behavior. :)
+(:
+ for $str in set:distinct($s)
+ where some $path in $paths satisfies count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0
+ return $str
+ :)
+ ""
+};
+
+(:~
+ : Returns the single element from an input sequence of elements that matches the largest number of
+ : XPath expressions from a given set, producing a non-empty set of nodes.
+ : If more than one answer is possible, return the first element according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-xpaths( ( <a><b/></a>, <d><c/><b/></d>, <d/>) , (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <d><c/><b/></d> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The element that matches the largest number of XPath expressions producing a non-empty set of nodes.
+ :)
+
+declare function con:most-xpaths ( $s as element()* , $paths as xs:string* ) {
+ (: NOTE(review): the eval-based implementation below is intentionally
+    commented out (workaround for bug lp:871051); the function currently
+    returns the empty string instead of the documented element.
+    Confirm whether callers/tests expect this stub behavior. :)
+ (:
+ (
+ for $str in set:distinct($s)
+ let $cnt := sum( for $path in $paths where count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0 return 1 )
+ order by $cnt descending
+ return $str
+ )[1]
+ :)
+ ""
+};
+
+(:~
+ : Returns the single element from an input sequence of elements that matches the smallest number of
+ : XPath expressions from a given set, producing a non-empty set of nodes.
+ : If more than one answer is possible, return the first element according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-xpaths( ( <a><b/></a>, <d><c/><b/></d>, <d/>) , (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <d/> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The element that matches the smallest number of XPath expressions producing a non-empty set of nodes.
+ :)
+
+declare function con:least-xpaths ( $s as element()* , $paths as xs:string* ) {
+(: NOTE(review): the eval-based implementation below is intentionally
+   commented out (workaround for bug lp:871051); the function currently
+   returns the empty string instead of the documented element.
+   Confirm whether callers/tests expect this stub behavior. :)
+(:
+ (
+ for $str in set:distinct($s)
+ let $cnt := sum( for $path in $paths where count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0 return 1 )
+ order by $cnt
+ return $str
+ )[1]
+ :)
+ ""
+};
+
+(:~
+ : Returns the nodes from an input sequence of nodes that validate against a given XML Schema.
+ :
+ : <br/>
+ : Example usage : <pre> validating-schema ( ( <a/> , <b/> ), <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"><xs:element name="a" /></xs:schema> ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <a/> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $schema An element encoding an XML Schema.
+ : @return The nodes that validate against the XML Schema.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function con:validating-schema ( $s as element()*, $schema as element() ) {
+ (: Not implemented (also stated in the xqdoc above): unconditionally returns
+    false() rather than the documented sequence of schema-valid nodes. :)
+ false()
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,407 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data conversion functions for processing calendar dates,
+ : temporal values, currency values, units of measurement, location names and postal addresses.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+declare namespace exref = "http://www.ecb.int/vocabulary/2002-08-01/eurofxref";
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+
+import schema namespace wp = 'http://api.whitepages.com/schema/';
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+
+import module namespace reflection = "http://www.zorba-xquery.com/modules/reflection";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~ The key to be used when accessing the White Pages Web service :)
+declare variable $conversion:key := "06ea2f21cc15602b6a3e242e3225a81a";
+
+(:~
+ : Uses a White-pages Web service to discover information about a given name,
+ : returning a sequence of strings for the phone numbers associated to the name.
+ :
+ :
+ : @param $name The name of person or organization.
+ : @return A sequence of strings for the phone numbers associated to the name.
+ : @example test/Queries/data-cleaning/conversion/phone-from-user.xq
+ :)
+declare %ann:nondeterministic function conversion:phone-from-user ( $name as xs:string) as xs:string*{
+ (: Build the find_person request (spaces URL-encoded as %20) and extract
+    every full phone number from the returned listings. :)
+ let $encoded-name := replace($name, " ", "%20")
+ let $request := concat("http://api.whitepages.com/find_person/1.0/?name=", $encoded-name, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ return $response/wp:wp/wp:listings/wp:listing/wp:phonenumbers/wp:phone/wp:fullphone/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given name,
+ : returning a sequence of strings for the addresses associated to the name.
+ :
+ :
+ : @param $name The name of person or organization.
+ : @return A sequence of strings for the addresses associated to the name.
+ : @example test/Queries/data-cleaning/conversion/address-from-user.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-user ( $name as xs:string) as xs:string*{
+ (: Query the find_person service and render each listed address as
+    "fullstreet, city, state, country". :)
+ let $encoded-name := replace($name, " ", "%20")
+ let $request := concat("http://api.whitepages.com/find_person/1.0/?name=", $encoded-name, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ for $addr in $response/wp:wp/wp:listings/wp:listing/wp:address
+ return concat($addr/wp:fullstreet/text(), ", ", $addr/wp:city/text(), ", ",
+               $addr/wp:state/text(), ", ", $addr/wp:country/text())
+};
+
+
+(:~
+ : Uses a White-pages Web service to discover information about a given phone number,
+ : returning a sequence of strings for the name associated to the phone number.
+ :
+ :
+ : @param $phone-number A string with 10 digits corresponding to the phone number.
+ : @return A sequence of strings for the person or organization's name associated to the phone number.
+ : @example test/Queries/data-cleaning/conversion/user-from-phone.xq
+ :)
+declare %ann:nondeterministic function conversion:user-from-phone ( $phone-number as xs:string) as xs:string*{
+ (: Reverse-phone lookup: return the display name of every matching listing. :)
+ let $request := concat("http://api.whitepages.com/reverse_phone/1.0/?phone=", $phone-number, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ return $response/wp:wp/wp:listings/wp:listing/wp:displayname/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given phone number,
+ : returning a string for the address associated to the phone number.
+ :
+ :
+ : @param $phone-number A string with 10 digits corresponding to the phone number.
+ : @return A string for the addresses associated to the phone number.
+ : @example test/Queries/data-cleaning/conversion/address-from-phone.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-phone ( $phone-number as xs:string) as xs:string*{
+ (: Reverse-phone lookup: format each listed address as
+    "fullstreet, city, state, country" and drop duplicates. :)
+ let $request := concat("http://api.whitepages.com/reverse_phone/1.0/?phone=", $phone-number, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ return distinct-values(
+   for $addr in $response/wp:wp/wp:listings/wp:listing/wp:address
+   return concat($addr/wp:fullstreet/text(), ", ", $addr/wp:city/text(), ", ",
+                 $addr/wp:state/text(), ", ", $addr/wp:country/text()) )
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given address,
+ : returning a sequence of strings for the names associated to the address.
+ :
+ :
+ : @param $address A string corresponding to the address (ex: 5655 E Gaskill Rd, Willcox, AZ, US).
+ : @return A sequence of strings for the person or organization's names associated to the address.
+ : @example test/Queries/data-cleaning/conversion/user-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:user-from-address ( $address as xs:string) as xs:string*{
+ (: Split the comma-separated address into tokens, pick the state token by
+    token count (4- or 5-part addresses), derive house number and street from
+    the first token, then ask the reverse_address service for display names. :)
+ let $tokens := tokenize($address, ",")
+ let $full-street := $tokens[1]
+ let $num-tokens := count($tokens)
+ let $state :=
+   if ($num-tokens = 4) then replace($tokens[3], " ", "")
+   else if ($num-tokens = 5) then replace($tokens[4], " ", "")
+   else ()
+ let $house := tokenize($full-street, " ")[1]
+ let $street := replace(replace($full-street, "[0-9]+[ ]", ""), " ", "%20")
+ let $request := concat("http://api.whitepages.com/reverse_address/1.0/?house=",$house, ";street=",$street, ";state=",$state,";api_key=",$conversion:key)
+ let $response := http:get-node($request)[2]
+ return $response/wp:wp/wp:listings/wp:listing/wp:displayname/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given address,
+ : returning a sequence of strings for the phone number associated to the address.
+ :
+ :
+ : @param $address A string corresponding to the address (ex: 5655 E Gaskill Rd, Willcox, AZ, US).
+ : @return A sequence of strings for the phone numbers associated to the address.
+ : @example test/Queries/data-cleaning/conversion/phone-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:phone-from-address ( $address as xs:string) as xs:string*{
+ (: Split the comma-separated address; the state token position depends on
+    whether the address has 4 or 5 comma-separated parts. :)
+ let $tokens := tokenize ($address, ",")
+ let $token-full-street := $tokens[position()=1]
+ let $state :=
+ if (count($tokens) = 4)
+ then replace($tokens[position()=3], " ", "")
+ else
+ if (count($tokens) = 5)
+ then replace($tokens[position()=4], " ", "")
+ else()
+ (: $house: strip letters, ordinal-suffix tokens (e.g. "1st"-style) and spaces
+    from the first token, leaving the bare house number — presumably; the
+    regex also removes digits followed by two letters. TODO confirm. :)
+ let $house := replace($token-full-street, "([A-Za-z]+|[0-9]+[A-Za-z][A-Za-z]|[ ]+)", "")
+ let $street-w-space := replace($token-full-street, $house, "")
+ (: NOTE(review): when $street-w-space has neither a leading nor a trailing
+    space, this falls through to () and $street-form becomes empty — verify
+    this is the intended behavior for single-word streets. :)
+ let $street :=
+ if (substring($street-w-space, 1, 1) = " ")
+ then substring($street-w-space, 2)
+ else
+ if(substring($street-w-space, string-length($street-w-space), 1) = " ")
+ then substring($street-w-space, 1, string-length($street-w-space)-1)
+ else ()
+ let $street-form := replace($street, " ", "%20")
+ let $url := concat("http://api.whitepages.com/reverse_address/1.0/?house=",$house, ";street=",$street-form, ";state=",$state,";api_key=",$conversion:key)
+ let $doc := http:get-node($url)[2]
+ return $doc/wp:wp/wp:listings/wp:listing/wp:phonenumbers/wp:phone/wp:fullphone/text()
+};
+
+(:~
+ : Conversion function for units of measurement, based on a built-in table of pairwise conversion rules.
+ : <br/>
+ :
+ :
+ : @param $v The amount we wish to convert.
+ : @param $t The type of metric (e.g., "Distance")
+ : @param $m1 The source measurement unit metric (e.g., "meter")
+ : @param $m2 The target measurement unit metric (e.g., "mile")
+ : @return The value resulting from the conversion
+ : @example test/Queries/data-cleaning/conversion/unit-convert.xq
+ :)
+declare %ann:nondeterministic function conversion:unit-convert ( $v as xs:double, $t as xs:string, $m1 as xs:string, $m2 as xs:string ) {
+ (: Identity conversion needs no table lookup. :)
+ if ( $m1 = $m2 ) then $v else
+
+(: Static table of pairwise conversion rules. For every metric except
+   "Temperature" the @value attribute is a multiplicative factor; for
+   "Temperature" it is an XQuery expression fragment that is appended to
+   $v and evaluated dynamically below (temperature conversions are affine,
+   not purely multiplicative). :)
+let $conversion-table :=
+ <unit-conversion-rules>
+ <unit type="Distance" from="mile" to="kilometer" value="1.609344" />
+ <unit type="Distance" from="mile" to="angstrom" value="16100000000000" />
+ <unit type="Distance" from="mile" to="picometer" value="1610000000000000" />
+ <unit type="Distance" from="mile" to="nanometer" value="1610000000000" />
+ (: fixed: was "microometer", a dead row inconsistent with the
+    "micrometer" spelling used by every other rule in this table :)
+ <unit type="Distance" from="mile" to="micrometer" value="1610000000" />
+ <unit type="Distance" from="mile" to="millimeter" value="1610000" />
+ <unit type="Distance" from="mile" to="centimeter" value="161000" />
+ <unit type="Distance" from="mile" to="meter" value="1610" />
+ <unit type="Distance" from="mile" to="inch" value="63400" />
+ <unit type="Distance" from="mile" to="feet" value="5280" />
+ <unit type="Distance" from="kilometer" to="meter" value="1000" />
+ <unit type="Distance" from="kilometer" to="picometer" value="1000000000000000" />
+ <unit type="Distance" from="kilometer" to="angstrom" value="10000000000000" />
+ <unit type="Distance" from="kilometer" to="nanometer" value="1000000000000" />
+ <unit type="Distance" from="kilometer" to="micrometer" value="1000000000" />
+ <unit type="Distance" from="kilometer" to="millimeter" value="1000000" />
+ <unit type="Distance" from="kilometer" to="centimeter" value="100000" />
+ <unit type="Distance" from="kilometer" to="inch" value="39400" />
+ <unit type="Distance" from="kilometer" to="feet" value="3280" />
+ <unit type="Distance" from="meter" to="centimeter" value="100" />
+ <unit type="Distance" from="meter" to="picometer" value="1000000000000" />
+ <unit type="Distance" from="meter" to="angstrom" value="10000000000" />
+ <unit type="Distance" from="meter" to="nanometer" value="1000000000" />
+ <unit type="Distance" from="meter" to="micrometer" value="1000000" />
+ <unit type="Distance" from="meter" to="millimeter" value="1000" />
+ <unit type="Distance" from="meter" to="inch" value="39.4" />
+ <unit type="Distance" from="meter" to="feet" value="3.28" />
+ <unit type="Distance" from="centimeter" to="millimeter" value="10" />
+ <unit type="Distance" from="millimeter" to="micrometer" value="1000" />
+ <unit type="Distance" from="micrometer" to="nanometer" value="1000" />
+ <unit type="Distance" from="nanometer" to="angstrom" value="10" />
+ <unit type="Distance" from="angstrom" to="picometer" value="100" />
+ <unit type="Distance" from="inch" to="feet" value="0.0833" />
+ <unit type="Mass" from="tons" to="kilograms" value="907.18474" />
+ <unit type="Mass" from="tons" to="pounds" value="2000" />
+ <unit type="Mass" from="tons" to="ounces" value="32000" />
+ <unit type="Mass" from="tons" to="grams" value="907184.74" />
+ <unit type="Mass" from="tons" to="milligrams" value="907184740" />
+ <unit type="Mass" from="kilograms" to="pounds" value="2.2046226" />
+ <unit type="Mass" from="kilograms" to="grams" value="1000" />
+ <unit type="Mass" from="kilograms" to="milligrams" value="1000000" />
+ <unit type="Mass" from="grams" to="milligrams" value="1000" />
+ <unit type="Mass" from="pounds" to="ounces" value="16" />
+ <unit type="Mass" from="pounds" to="grams" value="453.59237" />
+ <unit type="Mass" from="pounds" to="milligrams" value="453592.37" />
+ <unit type="Mass" from="ounces" to="kilograms" value="0.028349523" />
+ <unit type="Mass" from="ounces" to="grams" value="28.349523" />
+ <unit type="Mass" from="ounces" to="milligrams" value="28349.523" />
+ <unit type="Volume" from="liters" to="cubic centimeters" value="1000" />
+ <unit type="Energy" from="jouls" to="calories" value="0.239" />
+ <unit type="Pressure" from="pascals" to="kilopascals" value="0.001" />
+ <unit type="Pressure" from="pascals" to="bars" value="0.000001" />
+ <unit type="Pressure" from="pascals" to="mmHg" value="0.00750064" />
+ <unit type="Pressure" from="pascals" to="torrs" value="0.00750064" />
+ <unit type="Pressure" from="atmospheres" to="pascals" value="101325" />
+ <unit type="Pressure" from="atmospheres" to="kilopascals" value="101.325" />
+ <unit type="Pressure" from="atmospheres" to="bars" value="1.01325" />
+ <unit type="Pressure" from="atmospheres" to="mmHg" value="760" />
+ <unit type="Pressure" from="atmospheres" to="torrs" value="760" />
+ <unit type="Pressure" from="atmospheres" to="psi" value="14.7" />
+ <unit type="Pressure" from="psi" to="pascals" value="6890" />
+ <unit type="Pressure" from="psi" to="kilopascals" value="6.89" />
+ <unit type="Pressure" from="psi" to="bars" value="0.0689" />
+ <unit type="Pressure" from="psi" to="mmHg" value="51.7" />
+ <unit type="Pressure" from="psi" to="torrs" value="51.7" />
+ <unit type="Pressure" from="bars" to="kilopascals" value="100" />
+ <unit type="Pressure" from="bars" to="mmHg" value="750.064" />
+ <unit type="Pressure" from="bars" to="torrs" value="750.064" />
+ <unit type="Pressure" from="kilopascals" to="mmHg" value="7.50064" />
+ <unit type="Pressure" from="kilopascals" to="torrs" value="7.50064" />
+ <unit type="Pressure" from="mmHg" to="torrs" value="1" />
+ <unit type="Temperature" from="celsius" to="fahrenheit" value="* 9 div 5 + 32" />
+ <unit type="Temperature" from="celsius" to="kelvin" value="+ 273.15" />
+ <unit type="Temperature" from="kelvin" to="celsius" value="- 273.15" />
+ <unit type="Temperature" from="kelvin" to="fahrenheit" value="* 9 div 5 - 273.15 * 9 div 5 + 32" />
+ <unit type="Temperature" from="fahrenheit" to="celsius" value="* 5 div 9 - 32 * 5 div 9" />
+ <unit type="Temperature" from="fahrenheit" to="kelvin" value="* 5 div 9 - 32 * 5 div 9 + 273.15" />
+</unit-conversion-rules>
+
+(: $from collects every rule whose source unit is $m1: the direct rows, plus
+   (for non-temperature metrics only) inverted copies of the rows pointing
+   *to* $m1 — built with a copy/modify transform that takes the reciprocal
+   factor and swaps @from/@to. Temperature fragments cannot be inverted
+   numerically, so they are excluded here. :)
+let $from := $conversion-table/unit[@type=$t and @from=$m1] |
+ ( for $it in $conversion-table/unit[@type=$t and @to=$m1] return
+ if (compare($t, "Temperature") != 0) then
+ copy $aux := $it
+ modify (
+ replace value of node $aux/@value with 1.0 div $aux/@value,
+ replace value of node $aux/@from with $aux/@to,
+ replace value of node $aux/@to with $aux/@from
+ )
+ return $aux
+ else()
+ )
+
+return
+(: Temperature: evaluate "$v <fragment>" dynamically (affine formula). :)
+if (compare($t, "Temperature") = 0) then reflection:eval(concat($v , $conversion-table//unit[@from=$m1][@to=$m2]/@value))
+else
+ (: Use a direct rule when one exists; otherwise hop through the first
+    reachable intermediate unit and recurse toward $m2. :)
+ if ( $from[@to=$m2]) then ( $v * $from[@to=$m2]/@value )
+ else ( for $i in $from return conversion:unit-convert ( $v * $i/@value , $t , $i/@to , $m2 ) )[1]
+};
+
+(:~
+ : Placename to geospatial coordinates converter, acting as a wrapper over the Yahoo! geocoder service.
+ :
+ :
+ : @param $q A sequence of strings corresponding to the different components (e.g., street, city, country, etc.) of the place name.
+ : @return The pair of latitude and longitude coordinates associated with the input address.
+ : @example test/Queries/data-cleaning/conversion/geocode-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:geocode-from-address ( $q as xs:string* ) as xs:double* {
+ (: Join the address components (spaces become '+') into one comma-separated
+    query, call the Yahoo! geocoder, and return (latitude, longitude). :)
+ let $app-id := ""
+ let $query := string-join(for $part in $q return translate($part," ","+"),",")
+ let $call := concat("http://where.yahooapis.com/geocode?q=",$query,"&appid=",$app-id)
+ let $result := http:get-node($call)[2]/ResultSet/Result
+ return ( xs:double($result/latitude/text()) , xs:double($result/longitude/text()) )
+};
+
+(:~
+ : Geospatial coordinates to placename converter, acting as a wrapper over the Yahoo! reverse geocoder service.
+ :
+ :
+ : @param $lat Geospatial latitude.
+ : @param $lon Geospatial longitude.
+ : @return The sequence of strings corresponding to the different components (e.g., street, city, country, etc.) of the place name that corresponds to the input geospatial coordinates.
+ : @example test/Queries/data-cleaning/conversion/address-from-geocode.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-geocode ( $lat as xs:double, $lon as xs:double ) as xs:string*  {
+ (: Reverse-geocode via the Yahoo! service (gflags=R) and return the non-empty
+    place-name components from coarsest (country) to finest (house).
+    NOTE(review): $id (the appid) is empty here — presumably the service
+    tolerates it or a key is expected to be filled in; confirm. :)
+ let $id := ""
+ let $url := "http://where.yahooapis.com/geocode?q="
+ let $q := concat($lat,",+",$lon)
+ let $call := concat($url,$q,"&gflags=R&appid=",$id)
+ let $doc := http:get-node($call)[2]
+ (: Each //xs:string(*:tag) step atomizes the matching element (any namespace)
+    to a string; empty components are filtered out before deduplication. :)
+ return distinct-values( (if (string-length($doc//xs:string(*:country)) > 0) then ($doc//xs:string(*:country)) else (),
+ if (string-length($doc//xs:string(*:state)) > 0) then ($doc//xs:string(*:state)) else (),
+ if (string-length($doc//xs:string(*:county)) > 0) then ($doc//xs:string(*:county)) else (),
+ if (string-length($doc//xs:string(*:city)) > 0) then ($doc//xs:string(*:city)) else (),
+ if (string-length($doc//xs:string(*:neighborhood)) > 0) then ($doc//xs:string(*:neighborhood)) else (),
+ if (string-length($doc//xs:string(*:street)) > 0) then ($doc//xs:string(*:street)) else (),
+ if (string-length($doc//xs:string(*:house)) > 0) then ($doc//xs:string(*:house)) else () ) )
+};
+
+(:~
+ : Currency conversion function, acting as a wrapper over the WebService from the European Central Bank.
+ :
+ : WebService documentation at http://www.ecb.int/stats/exchange/eurofxref/html/index.en.html
+ :
+ :
+ : @param $v The amount we wish to convert.
+ : @param $m1 The source currency (e.g., "EUR").
+ : @param $m2 The target currency (e.g., "USD").
+ : @param $date The reference date.
+ : @return The value resulting from the conversion.
+ : @error conversion:notsupported if the date, the source currency type or the target currency type are not known to the service.
+ : @see http://www.ecb.int/stats/exchange/eurofxref/html/index.en.html
+ : @example test/Queries/data-cleaning/conversion/currency-convert.xq
+ :)
+declare %ann:nondeterministic function conversion:currency-convert ( $v as xs:double, $m1 as xs:string, $m2 as xs:string, $date as xs:string ) {
+ (: With no reference date, use the ECB daily feed; otherwise take the most
+  : recent historical rate table dated on or before $date. :)
+ let $rates :=
+   if (string-length($date) = 0)
+   then http:get-node("http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml")[2]
+   else (for $cube in http:get-node("http://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist.xml")[2]//exref:Cube[xs:string(@time)<=$date]
+         order by $cube/xs:string(@time) descending
+         return $cube)[1]
+ (: ECB quotes every rate against the Euro, so convert source -> EUR -> target. :)
+ let $sourceRate := if ( $m1="EUR" ) then (xs:double(1.0)) else ( $rates//exref:Cube[xs:string(@currency)=$m1]/xs:double(@rate) )
+ let $targetRate := if ( $m2="EUR" ) then (xs:double(1.0)) else ( $rates//exref:Cube[xs:string(@currency)=$m2]/xs:double(@rate) )
+ let $converted := ($v div $sourceRate) * $targetRate
+ (: An unknown currency or date yields NaN/empty; the regex check rejects it with conversion:notsupported. :)
+ return
+   if (matches(string($converted),"-?[0-9]+(\.[0-9]+)?"))
+   then ($converted)
+   else (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/conversion', 'conversion:notsupported'), data($converted)))
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the phone numbers associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the phone numbers associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:phone-from-domain ( $domain as xs:string ) {
+ (: Not implemented yet: always returns the empty sequence. :)
+ ()
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the addresses associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the addresses associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:address-from-domain ( $domain as xs:string ) {
+ (: Not implemented yet: always returns the empty sequence. :)
+ ()
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the person or organization names associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the person or organization names associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:name-from-domain ( $domain as xs:string ) {
+ (: Not implemented yet: always returns the empty sequence. :)
+ ()
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,223 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides hybrid string similarity functions, combining the properties of
+ : character-based string similarity functions and token-based string similarity functions.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the module requires the trigonometric functions of XQuery 1.1 or a math extension
+ : function such as sqrt($x as numeric) for computing the square root.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+(: In the QizX or Saxon XQuery engines, it is possible to call external functions from the Java math library :)
+(: declare namespace math = "java:java.lang.Math"; :)
+declare namespace math = "http://www.w3.org/2005/xpath-functions/math";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Soundex phonetic similarity function is used to discover token identity, which is equivalent to saying that
+ : this function returns the cosine similarity coefficient between sets of Soundex keys.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-soundex("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of Soundex keys extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq
+ :)
+declare function simh:soft-cosine-tokens-soundex ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ (: Map every token of each input string to its Soundex key, then score the
+  : cosine similarity of the two key sequences. :)
+ simt:cosine( for $token in tokenize($s1,$r) return simp:soundex-key($token),
+              for $token in tokenize($s2,$r) return simp:soundex-key($token) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Metaphone phonetic similarity function is used to discover token identity, which is equivalent to saying that
+ : this function returns the cosine similarity coefficient between sets of Metaphone keys.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-metaphone("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of Metaphone keys extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq
+ :)
+declare function simh:soft-cosine-tokens-metaphone ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ (: Replace each token by its Metaphone key and measure the cosine similarity
+  : of the resulting key sequences. :)
+ simt:cosine( for $token in tokenize($s1,$r) return simp:metaphone-key($token),
+              for $token in tokenize($s2,$r) return simp:metaphone-key($token) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Edit Distance similarity function is used to discover token identity, and tokens having an edit distance
+ : bellow a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-edit-distance("The FLWOR Foundation", "FLWOR Found.", " +", 0 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.408248290463863 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ :)
+declare function simh:soft-cosine-tokens-edit-distance ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:integer ) as xs:double {
+(:
+ let $tokens1 := tokenize($s1,$r)
+ let $tokens2 := tokenize($s2,$r)
+ let $tokens := ($tokens1, $tokens2)
+ let $vocab := for $a at $apos in $tokens
+               where every $ba in subsequence($tokens, 1, $apos - 1) satisfies not(simc:edit-distance($ba,$a) <= $t)
+               return $a
+ let $freq1 := for $a1 in $vocab return count($tokens1[simc:edit-distance(.,$a1) <= $t])
+ let $freq2 := for $a2 in $vocab return count($tokens2[simc:edit-distance(.,$a2) <= $t])
+ let $freq1pow := for $aa1 in $freq1 return $aa1 * $aa1
+ let $freq2pow := for $aa2 in $freq2 return $aa2 * $aa2
+ let $mult := for $freq at $pos in $freq1 return $freq * $freq2[$pos]
+ return sum($mult) div (math:sqrt(sum($freq1pow)) * math:sqrt(sum($freq2pow)))
+ :)
+ (: NOTE(review): the real implementation above is commented out, so this
+  : function currently always returns 0 — presumably disabled because it needs
+  : math:sqrt (see module header note and bug 871051). Confirm this is
+  : intentional before relying on the score. :)
+ xs:double(0)
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Jaro similarity function is used to discover token identity, and tokens having a Jaro similarity above
+ : a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-jaro("The FLWOR Foundation", "FLWOR Found.", " +", 1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq
+ :)
+declare function simh:soft-cosine-tokens-jaro ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:double ) as xs:double {
+ let $left := tokenize($s1,$r)
+ let $right := tokenize($s2,$r)
+ let $all := ($left, $right)
+ (: Build the vocabulary: keep a token only if no earlier token is Jaro-similar
+  : to it at threshold $t, i.e. one representative per fuzzy match class. :)
+ let $vocabulary := for $cand at $i in $all
+                    where every $prev in subsequence($all, 1, $i - 1) satisfies not(simc:jaro($prev,$cand) >= $t)
+                    return $cand
+ (: Soft term frequencies: count how many tokens of each string match each vocabulary entry. :)
+ let $tf1 := for $w in $vocabulary return count($left[simc:jaro(.,$w) >= $t])
+ let $tf2 := for $w in $vocabulary return count($right[simc:jaro(.,$w) >= $t])
+ (: Cosine similarity: dot product divided by the product of the Euclidean norms. :)
+ let $dot := sum(for $f at $i in $tf1 return $f * $tf2[$i])
+ return $dot div (math:sqrt(sum(for $f in $tf1 return $f * $f)) * math:sqrt(sum(for $f in $tf2 return $f * $f)))
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Jaro-Winkler similarity function is used to discover token identity, and tokens having a Jaro-Winkler
+ : similarity above a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-jaro-winkler("The FLWOR Foundation", "FLWOR Found.", " +", 1, 4, 0.1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.45 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @param $prefix The number of characters to consider when testing for equal prefixes with the Jaro-Winkler metric.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes with the Jaro-Winkler metric.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq
+ :)
+declare function simh:soft-cosine-tokens-jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:double, $prefix as xs:integer?, $fact as xs:double? ) as xs:double {
+ let $left := tokenize($s1,$r)
+ let $right := tokenize($s2,$r)
+ let $all := ($left, $right)
+ (: Vocabulary: one representative token per fuzzy equivalence class under
+  : Jaro-Winkler at threshold $t. :)
+ let $vocabulary := for $cand at $i in $all
+                    where every $prev in subsequence($all, 1, $i - 1) satisfies not(simc:jaro-winkler($prev,$cand,$prefix,$fact) >= $t)
+                    return $cand
+ (: Soft term frequencies of each string against the shared vocabulary. :)
+ let $tf1 := for $w in $vocabulary return count($left[simc:jaro-winkler(.,$w,$prefix,$fact) >= $t])
+ let $tf2 := for $w in $vocabulary return count($right[simc:jaro-winkler(.,$w,$prefix,$fact) >= $t])
+ (: Cosine of the two frequency vectors. :)
+ let $dot := sum(for $f at $i in $tf1 return $f * $tf2[$i])
+ return $dot div (math:sqrt(sum(for $f in $tf1 return $f * $f)) * math:sqrt(sum(for $f in $tf2 return $f * $f)))
+};
+
+(:~
+ : Returns the Monge-Elkan similarity coefficient between two strings, using the Jaro-Winkler
+ : similarity function to discover token identity.
+ :
+ : <br/>
+ : Example usage : <pre> monge-elkan-jaro-winkler("Comput. Sci. and Eng. Dept., University of California, San Diego", "Department of Computer Scinece, Univ. Calif., San Diego", 4, 0.1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.992 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $prefix The number of characters to consider when testing for equal prefixes with the Jaro-Winkler metric.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes with the Jaro-Winkler metric.
+ : @return The Monge-Elkan similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq
+ :)
+declare function simh:monge-elkan-jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double{
+ let $s1tokens := tokenize($s1, " ")
+ (: Fixed: must tokenize $s2 here — the previous code tokenized $s1 twice, so
+  : the function compared the first string against itself and always scored
+  : perfect matches regardless of $s2. :)
+ let $s2tokens := tokenize($s2, " ")
+ let $length := min((count($s1tokens), count($s2tokens)))
+ (: For every token of $s1, take its best Jaro-Winkler match among the tokens of $s2. :)
+ let $res := for $s1n in $s1tokens
+             return max(for $s2n in $s2tokens return simc:jaro-winkler($s1n,$s2n,$prefix,$fact))
+ (: Monge-Elkan score: average of the per-token best-match similarities. :)
+ return (1 div $length) * sum($res)
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1560 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data normalization functions for processing calendar dates,
+ : temporal values, currency values, units of measurement, location names and postal addresses.
+ :
+ : These functions are particularly useful for converting different data representations into canonical formats.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Converts a given string representation of a date value into a date representation valid according
+ : to the corresponding XML Schema type.
+ :
+ :
+ : @param $sd The string representation for the date
+ : @param $format An optional parameter denoting the format used to represent the date in the string, according to a
+ : sequence of conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed
+ : by a single letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion
+ : specification is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ : <pre>
+ : '%b' Abbreviated month name in the current locale.<br/>
+ : '%B' Full month name in the current locale.<br/>
+ : '%d' Day of the month as decimal number (01-31).<br/>
+ : '%m' Month as decimal number (01-12).<br/>
+ : '%x' Date, locale-specific.<br/>
+ : '%y' Year without century (00-99).<br/>
+ : '%Y' Year with century.<br/>
+ : '%C' Century (00-99): the integer part of the year divided by 100.<br/>
+ : '%D' Locale-specific date format such as '%m/%d/%y'.<br/>
+ : '%e' Day of the month as decimal number (1-31), with a leading space for a single-digit number.<br/>
+ : '%F' Equivalent to %Y-%m-%d (the ISO 8601 date format).<br/>
+ : '%h' Equivalent to '%b'.<br/>
+ :</pre>
+ :
+ : @return The date value resulting from the conversion.
+ : @example test/Queries/data-cleaning/normalization/to-date.xq
+ :)
+declare function normalization:to-date ( $sd as xs:string, $format as xs:string? ) as xs:string?{
+ (: Fixed: return type widened from xs:string to xs:string? — the else() branch
+  : below returns the empty sequence when $sd does not split into at least two
+  : tokens, which was a type error under the old declaration. This also matches
+  : normalization:to-time, which already declares xs:string?. :)
+
+ let $dictionary := normalization:month-dictionary()
+ (: Drop everything before the first '%'; the remaining items are the conversion-specification letters. :)
+ let $format-tokens := tokenize($format, "[ %\-/:]+")[position()>1]
+ let $sd-tokens :=
+  if (contains($sd, "-") or contains($sd, "/") or contains($sd, " "))
+  then tokenize ($sd, "[ \-/]+")
+  (: No separator present: split a compact form such as "12May1999" into
+   : (leading digits, letter run, trailing digits). :)
+  else let $ydtoken := tokenize(replace($sd, "[A-Za-z]", " "), " ")
+    let $ft := $ydtoken[position()=1]
+    let $lt := $ydtoken[last()]
+    let $mtoken := replace($sd, "[0-9]", "") return ($ft, $mtoken, $lt)
+ return
+  if (count($sd-tokens)>1)
+  then
+   (: Year: first matching specifier wins (%F, %D, %Y, then %y with optional %C);
+    : two-digit years without a century default to the 1900s; "YND" = year not determined. :)
+   let $year :=
+    if (count(index-of($format-tokens, "F")) != 0) then string($sd-tokens[position() = 1])
+    else
+
+    if (count(index-of($format-tokens, "D")) != 0) then concat("19", string($sd-tokens[position() = 3]))
+    else
+
+    if (count(index-of($format-tokens, "Y")) != 0)
+    then string($sd-tokens[position() = index-of($format-tokens, "Y")]) else
+
+    if (count(index-of($format-tokens, "y")) != 0)
+    then
+     if(count(index-of($format-tokens, "C")) !=0)
+     then concat(string(number(string($sd-tokens[position() = index-of($format-tokens, "C")]))-1), string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+     else
+      concat("19", string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+    else "YND"
+
+   (: Month: name-based specifiers (%h, %b, %B) are resolved through the month
+    : dictionary; positional/numeric ones (%F, %D, %m) are taken directly; "MND" = month not determined. :)
+   let $month :=
+    if (count(index-of($format-tokens, "h")) != 0)
+    then string($dictionary//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "h")]]/@value) else
+
+    if (count(index-of($format-tokens, "b")) != 0)
+    then string($dictionary//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "b")]]/@value)
+    else
+
+    if (count(index-of($format-tokens, "B")) != 0)
+    then string($dictionary//month[lower-case(@name) =
+        lower-case($sd-tokens[position() = index-of($format-tokens, "B")])]/@value)
+
+    else
+
+    if (count(index-of($format-tokens, "F")) != 0)
+    then string($sd-tokens[position() = 2])
+    else
+
+    if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 1])
+    else
+
+    if (count(index-of($format-tokens, "m")) != 0)
+    then string($sd-tokens[position() = index-of($format-tokens, "m")])
+
+    else "MND"
+
+   (: Day: %F and %D are positional; %d is taken as-is; %e is zero-padded; "DND" = day not determined. :)
+   let $day :=
+    if (count(index-of($format-tokens, "F")) != 0)
+    then string($sd-tokens[position() = 3]) else
+
+    if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 2])
+    else
+
+    if (count(index-of($format-tokens, "d")) != 0)
+    then $sd-tokens[position() = index-of($format-tokens, "d")] else
+
+    if (count(index-of($format-tokens, "e")) != 0)
+    then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+    else "DND"
+
+   (: Assemble YYYY-MM-DD and let check-date validate the candidate. :)
+   let $result := concat($year, "-", $month, "-", $day)
+
+   return normalization:check-date($result)
+  else()
+
+};
+
+(:~
+ : Converts a given string representation of a time value into a time representation valid according to
+ : the corresponding XML Schema type.
+ :
+ :
+ : @param $sd The string representation for the time.
+ : @param $format An optional parameter denoting the format used to represent the time in the string, according to a sequence of
+ : conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed by a single
+ : letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion specification
+ : is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ :
+ : <pre>
+ : '%H' Hours as decimal number (00-23).<br/>
+ : '%I' Hours as decimal number (01-12).<br/>
+ : '%M' Minute as decimal number (00-59).<br/>
+ : '%p' AM/PM indicator in the locale. Used in conjunction with '%I' and *not* with '%H'.<br/>
+ : '%S' Second as decimal number (00-61), allowing for up to two leap-seconds.<br/>
+ : '%X' Time, locale-specific.<br/>
+ : '%z' Offset from Greenwich, so '-0900' is 9 hours west of Greenwich.<br/>
+ : '%Z' Time zone as a character string.<br/>
+ : '%k' The 24-hour clock time with single digits preceded by a blank.<br/>
+ : '%l' The 12-hour clock time with single digits preceded by a blank.<br/>
+ : '%r' The 12-hour clock time (using the locale's AM or PM).<br/>
+ : '%R' Equivalent to '%H:%M'.<br/>
+ : '%T' Equivalent to '%H:%M:%S'.<br/>
+ :</pre>
+ :
+ : @return The time value resulting from the conversion.
+ : @example test/Queries/data-cleaning/normalization/to-time.xq
+ :)
+declare function normalization:to-time ( $sd as xs:string, $format as xs:string? ) as xs:string?{
+ let $timezoneDict := normalization:timeZone-dictionary()
+ let $format-string := replace(replace ($format, '%R', '%H:%M'), '%T', '%H:%M:%S')
+ let $format-tokens := tokenize($format-string, "( |%|:)+")[position()>1]
+ let $sd-tokens :=
+ if (contains($sd, ":") or contains($sd, ".") or contains($sd, " "))
+ then tokenize ($sd, "[ :\.]")
+ else ()
+ return
+ if (count($sd-tokens)>1)
+ then
+ let $hours :=
+ if (count(index-of($format-tokens, "T")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),1,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "H")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "H")]) else
+
+ if (count(index-of($format-tokens, "k")) != 0)
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "k")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "k")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "k")])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then
+ if(lower-case(string($sd-tokens[position() = 4]))="am")
+ then string($sd-tokens[position() = 1])
+ else if(lower-case(string($sd-tokens[position() = 4]))="pm")
+ then if(string($sd-tokens[position() = 1])="12") then 12
+ else string(number(string($sd-tokens[position() = 1]))+12)
+ else()
+
+ else
+
+ if (count(index-of($format-tokens, "I")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then string($sd-tokens[position() = index-of($format-tokens, "I")])
+ else if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "I")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "I")]))+12)
+ else()
+ else()
+
+ else
+ if (count(index-of($format-tokens, "l")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "l")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "l")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "l")])
+
+ else if (lower-case(string($sd-tokens[position() = index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "l")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "l")]))+12)
+ else()
+
+ else ()
+
+
+ else "HND"
+
+ let $minutes :=
+
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),4,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "M")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "M")])
+
+ else "MND"
+
+ let $seconds :=
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),7,2)
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 3])
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then "00"
+ else
+
+ if (count(index-of($format-tokens, "S")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "S")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "00"
+
+ let $result :=
+
+ if (count(index-of($format-tokens, "Z")) != 0)
+ then
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() = index-of($format-tokens, "Z")]]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) > 59)
+ then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) < 0)
+ then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + -(number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2)))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2)))) mod 60))
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+
+
+ if (count(index-of($format-tokens, "z")) != 0)
+ then if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) > 59) then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) < 0) then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + -(number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)))) mod 60))
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+ concat($hours, ":", $minutes, ":", $seconds)
+
+ return
+
+ normalization:check-time($result)
+ else()
+
+};
+
+(:~
+ : Converts a given string representation of a dateTime value into a dateTime representation
+ : valid according to the corresponding XML Schema type.
+ :
+ :
+ : @param $sd The string representation for the dateTime.
+ : @param $format An optional parameter denoting the format used to represent the dateTime in the string, according to a sequence
+ : of conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed by a single
+ : letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion specification
+ : is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ :
+ : <pre>
+ : '%b' Abbreviated month name in the current locale.<br/>
+ : '%B' Full month name in the current locale.<br/>
+ : '%c' Date and time, locale-specific.<br/>
+ : '%C' Century (00-99): the integer part of the year divided by 100.<br/>
+ : '%d' Day of the month as decimal number (01-31).<br/>
+ : '%H' Hours as decimal number (00-23).<br/>
+ : '%I' Hours as decimal number (01-12).<br/>
+ : '%j' Day of year as decimal number (001-366).<br/>
+ : '%m' Month as decimal number (01-12).<br/>
+ : '%M' Minute as decimal number (00-59).<br/>
+ : '%p' AM/PM indicator in the locale. Used in conjunction with '%I' and *not* with '%H'.<br/>
+ : '%S' Second as decimal number (00-61), allowing for up to two leap-seconds.<br/>
+ : '%x' Date, locale-specific.<br/>
+ : '%X' Time, locale-specific.<br/>
+ : '%y' Year without century (00-99).<br/>
+ : '%Y' Year with century.<br/>
+ : '%z' Offset from Greenwich, so '-0900' is 9 hours west of Greenwich.<br/>
+ : '%Z' Time zone as a character string.<br/>
+ : '%D' Locale-specific date format such as '%m/%d/%y': ISO C99 says it should be that exact format.<br/>
+ : '%e' Day of the month as decimal number (1-31), with a leading space for a single-digit number.<br/>
+ : '%F' Equivalent to %Y-%m-%d (the ISO 8601 date format).<br/>
+ : '%g' The last two digits of the week-based year (see '%V').<br/>
+ : '%G' The week-based year (see '%V') as a decimal number.<br/>
+ : '%h' Equivalent to '%b'.<br/>
+ : '%k' The 24-hour clock time with single digits preceded by a blank.<br/>
+ : '%l' The 12-hour clock time with single digits preceded by a blank.<br/>
+ : '%r' The 12-hour clock time (using the locale's AM or PM).<br/>
+ : '%R' Equivalent to '%H:%M'.<br/>
+ : '%T' Equivalent to '%H:%M:%S'.<br/>
+ :</pre>
+ :
+ : @return The dateTime value resulting from the conversion.
+ : @example test/Queries/data-cleaning/normalization/to-dateTime.xq
+ :)
+declare function normalization:to-dateTime ( $sd as xs:string, $format as xs:string? ) as xs:string {
+ (: Tokenize the input against the format string, rebuild an ISO dateTime, and
+    apply any "%Z"/"%z" timezone offset with manual carry/borrow over minutes,
+    hours, days, months and years. :)
+ let $timezoneDict := normalization:timeZone-dictionary()
+ let $monthDict := normalization:month-dictionary()
+ let $format-string := replace(replace(replace ($format, '%R', '%H:%M'), '%T', '%H:%M:%S'), '%F', '%Y-%m-%d')
+ let $format-tokens := tokenize($format-string, "[ %\-/:\.]+")[position()>1]
+ let $sdt :=
+ if (contains($sd, ":") or contains($sd, ".") or contains($sd, " ") or contains($sd, "-")
+ or contains($sd, "/"))
+ then tokenize ($sd, "[ \-/:\.]+")
+ else ()
+ let $sdtok :=
+ (: "%z" offsets without an explicit '+' lost their '-' during tokenization; restore it. :)
+ if ((count(index-of($format-tokens, "z")) != 0) and (not(contains($sdt[last()], "+"))))
+ then ($sdt[position() != last()], concat("-", $sdt[position() = last()]))
+ else $sdt
+ let $sd-tokens :=
+ for $a in $sdtok
+ return
+ (: Split fused tokens like "01Jan70" into ("01", "Jan", "70"). :)
+ if (matches($a, "[0-9][0-9][A-Za-z]+[0-9][0-9]+"))
+ then (let $ydtoken := tokenize(replace($a, "[A-Za-z]", " "), " ")
+ let $ft := $ydtoken[position()=1]
+ let $lt := $ydtoken[last()]
+ let $mtoken := replace($a, "[0-9]", "") return ($ft, $mtoken, $lt))
+ else $a
+ let $timeFormat := tokenize($format, "[ :\.\-]")[position()>1]
+ let $dateFormat := tokenize($format, "[ :\.\-]")[position()=1]
+ return
+ if (count($sd-tokens)>1)
+ then
+ (:Date:)
+ let $year :=
+ if (count(index-of($format-tokens, "F")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then concat("19", string($sd-tokens[position() = 3]))
+ else
+
+ if (count(index-of($format-tokens, "Y")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "Y")]) else
+
+ if (count(index-of($format-tokens, "y")) != 0)
+ then
+ if(count(index-of($format-tokens, "C")) !=0)
+ then concat(string(number(string($sd-tokens[position() = index-of($format-tokens, "C")]))-1), string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else
+ concat("19", string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else "YND"
+
+ let $month :=
+ if (count(index-of($format-tokens, "h")) != 0)
+ then string($monthDict//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "h")]]/@value) else
+
+ if (count(index-of($format-tokens, "b")) != 0)
+ then string($monthDict//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "b")]]/@value)
+ else
+
+ if (count(index-of($format-tokens, "B")) != 0)
+ then string($monthDict//month[lower-case(@name) =
+ lower-case($sd-tokens[position() = index-of($format-tokens, "B")])]/@value)
+
+ else
+
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "m")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "m")])
+
+ else "MND"
+
+ let $day :=
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "d")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "d")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "DND"
+
+ (:Time:)
+ let $hours :=
+ if (count(index-of($format-tokens, "T")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),1,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "H")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "H")]) else
+
+ if (count(index-of($format-tokens, "k")) != 0)
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "k")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "k")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "k")])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then
+ if(lower-case(string($sd-tokens[position() = 4]))="am")
+ then string($sd-tokens[position() = 1])
+ else if(lower-case(string($sd-tokens[position() = 4]))="pm")
+ then if(string($sd-tokens[position() = 1])="12") then 12
+ else string(number(string($sd-tokens[position() = 1]))+12)
+ else()
+
+ else
+
+ if (count(index-of($format-tokens, "I")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then string($sd-tokens[position() = index-of($format-tokens, "I")])
+ else if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "I")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "I")]))+12)
+ else()
+ else()
+
+ else
+ if (count(index-of($format-tokens, "l")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "l")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "l")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "l")])
+
+ else if (lower-case(string($sd-tokens[position() = index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "l")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "l")]))+12)
+ else()
+
+ else ()
+
+
+ else "HND"
+
+ let $minutes :=
+
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),4,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "M")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "M")])
+
+ else "MND"
+
+ let $seconds :=
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),7,2)
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 3])
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then "00"
+ else
+
+ if (count(index-of($format-tokens, "S")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "S")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "00"
+
+ (: Apply the timezone offset, if any. Offset strings look like "+HHMM"/"-HHMM":
+    substring(...,2,2) = offset hours, substring(...,4,2) = offset minutes. :)
+ let $result :=
+
+ if (count(index-of($format-tokens, "Z")) != 0)
+ then
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() = index-of($format-tokens, "Z")]]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) > 59)
+ then 1
+ else 0
+
+ let $dayscomplement :=
+ if (number($complement) + number($hours) + number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =index-of($format-tokens, "Z")]]),2,2)) >= 24)
+ then 1
+ else 0
+
+ let $monthscomplement :=
+ if(($dayscomplement + number($day) > 28) and (compare($month, '02') = 0) and (number($year) mod 4 != 0))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 30) and ((compare($month, '04') = 0) or (compare($month, '06') = 0) or (compare($month, '09') = 0) or (compare($month, '11') = 0)))
+ then 1
+ else
+ (: fixed: '04' removed from the 31-day month list (April has 30 days). :)
+ if(($dayscomplement + number($day) > 31) and ((compare($month, '01') = 0) or (compare($month, '03') = 0) or (compare($month, '05') = 0) or (compare($month, '07') = 0) or (compare($month, '08') = 0) or (compare($month, '10') = 0) or (compare($month, '12') = 0)))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 29) and (compare($month, '02') = 0) and (number($year) mod 4 = 0))
+ then 1
+ else 0
+
+ let $ryear :=
+ if ($monthscomplement + number($month) > 12)
+ then string(number($year) + 1)
+ else $year
+
+ let $daywcompl :=
+ if ($monthscomplement = 1)
+ then 1
+ else number($day) + $dayscomplement
+
+ let $monthwcompl :=
+ if($monthscomplement + number($month) <= 12)
+ then number($month) + $monthscomplement
+ else 1
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),1,1)='-')
+ then
+ (: fixed: minutes borrow must compare against the offset MINUTES (offset 4), not hours (offset 2). :)
+ let $complement :=
+ if (number($minutes)-number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) < 0)
+ then -1
+ else 0
+
+ (: fixed: the borrow test is on the adjusted hour (complement + hours - offset), not (complement - hours - offset). :)
+ let $dayscomplement :=
+ if (number($complement) + number($hours) - number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position()=
+ index-of($format-tokens, "Z")]]),2,2)) < 0)
+ then -1
+ else 0
+
+ let $monthcomplement :=
+ if(number($day) + $dayscomplement < 1)
+ then -1
+ else 0
+
+ let $yearcomplement :=
+ if(number($month) + $monthcomplement< 1)
+ then -1
+ else 0
+
+ let $daywcompl :=
+ if ($monthcomplement = 0)
+ then number($day) + $dayscomplement
+ else
+ if ( (number($month) = 5) or (number($month) = 7) or (number($month) = 10) or (number($month) = 12))
+ then 30
+ else
+ if((number($month) = 4) or (number($month) = 6) or (number($month) = 9) or (number($month) = 11) or (number($month) = 2) or (number($month) = 1) or (number($month) = 8))
+ then 31
+ else
+ if((number($month) = 3) and (number($year) mod 4 != 0))
+ then 28
+ else
+ if((number($month) = 3) and (number($year) mod 4 = 0))
+ then 29
+ else number($day) + $dayscomplement
+
+ let $monthwcompl:=
+ if($yearcomplement = 0)
+ then number($month) + $monthcomplement
+ else 12
+
+ let $ryear :=
+ number($year) + $yearcomplement
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (: fixed: same expression as the unpadded branch; $hours was wrongly negated here. :)
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (: fixed: read the offset MINUTES (offset 4), matching the unpadded branch. :)
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+
+
+ if (count(index-of($format-tokens, "z")) != 0)
+ then if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) > 59) then 1
+ else 0
+
+ let $dayscomplement :=
+ if (number($complement) + number($hours) + number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)) >= 24)
+ then 1
+ else 0
+
+ let $monthscomplement :=
+ if(($dayscomplement + number($day) > 28) and (compare($month, '02') = 0) and (number($year) mod 4 != 0))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 30) and ((compare($month, '04') = 0) or (compare($month, '06') = 0) or (compare($month, '09') = 0) or (compare($month, '11') = 0)))
+ then 1
+ else
+ (: fixed: '04' removed from the 31-day month list (April has 30 days). :)
+ if(($dayscomplement + number($day) > 31) and ((compare($month, '01') = 0) or (compare($month, '03') = 0) or (compare($month, '05') = 0) or (compare($month, '07') = 0) or (compare($month, '08') = 0) or (compare($month, '10') = 0) or (compare($month, '12') = 0)))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 29) and (compare($month, '02') = 0) and (number($year) mod 4 = 0))
+ then 1
+ else 0
+
+ let $ryear :=
+ if ($monthscomplement + number($month) > 12)
+ then string(number($year) + 1)
+ else $year
+
+ let $daywcompl :=
+ if ($monthscomplement = 1)
+ then 1
+ else number($day) + $dayscomplement
+
+ let $monthwcompl :=
+ if($monthscomplement + number($month) <= 12)
+ then number($month) + $monthscomplement
+ else 1
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) < 0) then -1
+ else 0
+
+ (: fixed: the borrow test is on the adjusted hour (complement + hours - offset), not (complement - hours - offset). :)
+ let $dayscomplement :=
+ if (number($complement) + number($hours) - number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)) < 0)
+ then -1
+ else 0
+
+ let $monthcomplement :=
+ if(number($day) + $dayscomplement< 1)
+ then -1
+ else 0
+
+ let $yearcomplement :=
+ if(number($month) + $monthcomplement< 1)
+ then -1
+ else 0
+
+ let $daywcompl :=
+ if ($monthcomplement = 0)
+ then number($day) + $dayscomplement
+ else
+ if ( (number($month) = 5) or (number($month) = 7) or (number($month) = 10) or (number($month) = 12))
+ then 30
+ else
+ if((number($month) = 4) or (number($month) = 6) or (number($month) = 9) or (number($month) = 11) or (number($month) = 2) or (number($month) = 1) or (number($month) = 8))
+ then 31
+ else
+ if((number($month) = 3) and (number($year) mod 4 != 0))
+ then 28
+ else
+ if((number($month) = 3) and (number($year) mod 4 = 0))
+ then 29
+ else number($day) + $dayscomplement
+
+ let $monthwcompl:=
+ if($yearcomplement = 0)
+ then number($month) + $monthcomplement
+ else 12
+
+ let $ryear :=
+ number($year) + $yearcomplement
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (: fixed: same expression as the unpadded branch; $hours was wrongly negated here. :)
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (: fixed: read the offset MINUTES (offset 4), matching the unpadded branch. :)
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+ concat($year, "-", $month, "-", $day, "T", $hours, ":", $minutes, ":", $seconds)
+
+ return
+ normalization:check-dateTime($result)
+ else()
+};
+
+(:~
+ : Uses an address normalization Web service to convert a postal address given as input into a
+ : canonical representation format.
+ :
+ :
+ : @param $addr A sequence of strings encoding an address, where each string in the sequence corresponds to a different component (e.g., street, city, country, etc.) of the address.
+ : @return A sequence of strings with the address encoded in a canonical format, where each string in the sequence corresponds to a different component (e.g., street, city, country, etc.) of the address.
+ : @example test/Queries/data-cleaning/normalization/normalize-address.xq
+ :)
+declare %ann:nondeterministic function normalization:normalize-address ( $addr as xs:string* ) as xs:string* {
+
+ (: NOTE(review): $id is an empty Yahoo appid; the geocode service may reject
+    or throttle keyless calls — confirm whether a real appid should be set. :)
+ let $id := ""
+ let $url := "http://where.yahooapis.com/geocode?q="
+ (: Build the query string: spaces inside each address component become '+',
+    components are joined with commas. :)
+ let $q2 := string-join(for $i in $addr return translate($i," ","+"),",")
+ let $call := concat($url,$q2,"&appid=",$id)
+ (: http:get-node returns (metadata, payload); [2] selects the response document. :)
+ let $doc := http:get-node($call)[2]
+ (: Keep only the non-empty address components, from most general (country)
+    to most specific (house), and drop duplicates. :)
+ return distinct-values( (if (string-length($doc//xs:string(*:country)) > 0) then ($doc//xs:string(*:country)) else (),
+ if (string-length($doc//xs:string(*:state)) > 0) then ($doc//xs:string(*:state)) else (),
+ if (string-length($doc//xs:string(*:county)) > 0) then ($doc//xs:string(*:county)) else (),
+ if (string-length($doc//xs:string(*:city)) > 0) then ($doc//xs:string(*:city)) else (),
+ if (string-length($doc//xs:string(*:neighborhood)) > 0) then ($doc//xs:string(*:neighborhood)) else (),
+ if (string-length($doc//xs:string(*:street)) > 0) then ($doc//xs:string(*:street)) else (),
+ if (string-length($doc//xs:string(*:house)) > 0) then ($doc//xs:string(*:house)) else () ) )
+};
+
+(:~
+ : Uses a phone number normalization Web service to convert a phone number given as input into a
+ : canonical representation.
+ :
+ : @param $phone A string encoding a phone number.
+ : @return A string with the phone number encoded in a canonical format.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+(: Placeholder implementation: phone-number normalization is not implemented
+   yet, so the empty sequence is always returned.  The parameter is renamed
+   from $addr to $phone to match the documented @param name; XQuery function
+   calls are positional, so existing callers are unaffected. :)
+declare function normalization:normalize-phone ( $phone as xs:string* ) as xs:string* {
+  ()
+};
+
+(:~
+ : Internal auxiliary function that returns an XML representation for a dictionary that contains the
+ : time-shift value associated to different time-zone abbreviations.
+ :)
+(: Every value is a UTC offset in the fixed "+HHMM"/"-HHMM" lexical form;
+   callers parse it positionally (sign at 1, hours at 2-3, minutes at 4-5).
+   BUGFIX: "LHST" was "+10:30" (breaking the fixed-width format) and the
+   nautical letter "E" had an empty value (the letter series D=+0400,
+   F=+0600 implies E=+0500).  Abbreviations with several possible offsets
+   (AMST, CST, IST, WST, ...) keep one active entry; the alternatives are
+   preserved as XML comments. :)
+declare %private function normalization:timeZone-dictionary() as element(){
+ let $result :=
+ <dictionary>
+ <timeZone name="A" value="+0100"/>
+ <timeZone name="ADT" value="-0300"/>
+ <timeZone name="AFT" value="+0430"/>
+ <timeZone name="AKDT" value="-0800"/>
+ <timeZone name="AKST" value="-0900"/>
+ <timeZone name="ALMT" value="+0600"/>
+ <timeZone name="AMST" value="+0500"/>
+ <!--<timeZone name="AMST" value="-0300"/>-->
+ <timeZone name="AMT" value="+0400"/>
+ <!--<timeZone name="AMT" value="-0400"/>-->
+ <timeZone name="ANAST" value="+1200"/>
+ <timeZone name="ANAT" value="+1200"/>
+ <timeZone name="AQTT" value="+0500"/>
+ <timeZone name="ART" value="-0300"/>
+ <timeZone name="AST" value="-0400"/>
+ <timeZone name="AZOST" value="+0000"/>
+ <timeZone name="AZOT" value="-0100"/>
+ <timeZone name="AZST" value="+0500"/>
+ <timeZone name="AZT" value="+0400"/>
+ <timeZone name="B" value="+0200"/>
+ <timeZone name="BNT" value="+0800"/>
+ <timeZone name="BOT" value="-0400"/>
+ <timeZone name="BRST" value="-0200"/>
+ <timeZone name="BRT" value="-0300"/>
+ <!--<timeZone name="BST" value="+0600"/>-->
+ <timeZone name="BST" value="+0100"/>
+ <timeZone name="BTT" value="+0600"/>
+ <timeZone name="C" value="+0300"/>
+ <timeZone name="CAST" value="+0800"/>
+ <timeZone name="CAT" value="+0200"/>
+ <timeZone name="CCT" value="+0630"/>
+ <!--<timeZone name="CDT" value="+1030"/>-->
+ <!--<timeZone name="CDT" value="-0400"/>-->
+ <timeZone name="CDT" value="-0500"/>
+ <timeZone name="CEST" value="+0200"/>
+ <timeZone name="CET" value="+0100"/>
+ <timeZone name="CHADT" value="+1345"/>
+ <timeZone name="CHAST" value="+1245"/>
+ <timeZone name="CKT" value="-1000"/>
+ <timeZone name="CLST" value="-0300"/>
+ <timeZone name="CLT" value="-0400"/>
+ <timeZone name="COT" value="-0500"/>
+ <!--<timeZone name="CST" value="+0800"/>-->
+ <!--<timeZone name="CST" value="+0930"/>-->
+ <!--<timeZone name="CST" value="-0600"/>-->
+ <!--<timeZone name="CST" value="-0500"/>-->
+ <timeZone name="CST" value="-0600"/>
+ <timeZone name="CVT" value="-0100"/>
+ <timeZone name="CXT" value="+0700"/>
+ <timeZone name="ChST" value="+1000"/>
+ <timeZone name="D" value="+0400"/>
+ <timeZone name="DAVT" value="+0700"/>
+ <timeZone name="E" value="+0500"/>
+ <timeZone name="EASST" value="-0500"/>
+ <timeZone name="EAST" value="-0600"/>
+ <timeZone name="EAT" value="+0300"/>
+ <timeZone name="ECT" value="-0500"/>
+ <!--<timeZone name="EDT" value="+1100"/>-->
+ <timeZone name="EDT" value="-0400"/>
+ <timeZone name="EEST" value="+0300"/>
+ <timeZone name="EET" value="+0200"/>
+ <timeZone name="EGST" value="+0000"/>
+ <timeZone name="EGT" value="-0100"/>
+ <timeZone name="EST" value="+1000"/>
+ <!--<timeZone name="EST" value="-0500"/>-->
+ <timeZone name="ET" value="-0500"/>
+ <timeZone name="F" value="+0600"/>
+ <timeZone name="FJST" value="+1300"/>
+ <timeZone name="FJT" value="+1200"/>
+ <timeZone name="FKST" value="-0300"/>
+ <timeZone name="FKT" value="-0400"/>
+ <timeZone name="FNT" value="-0200"/>
+ <timeZone name="G" value="+0700"/>
+ <timeZone name="GALT" value="-0600"/>
+ <timeZone name="GAMT" value="-0900"/>
+ <timeZone name="GET" value="+0400"/>
+ <timeZone name="GFT" value="-0300"/>
+ <timeZone name="GILT" value="+1200"/>
+ <timeZone name="GMT" value="+0000"/>
+ <timeZone name="GST" value="+0400"/>
+ <timeZone name="GYT" value="-0400"/>
+ <timeZone name="H" value="+0800"/>
+ <timeZone name="HAA" value="-0300"/>
+ <timeZone name="HAC" value="-0500"/>
+ <timeZone name="HADT" value="-0900"/>
+ <timeZone name="HAE" value="-0400"/>
+ <timeZone name="HAP" value="-0700"/>
+ <timeZone name="HAR" value="-0600"/>
+ <timeZone name="HAST" value="-1000"/>
+ <timeZone name="HAT" value="-0230"/>
+ <timeZone name="HAY" value="-0800"/>
+ <timeZone name="HKT" value="+0800"/>
+ <timeZone name="HLV" value="-0430"/>
+ <timeZone name="HNA" value="-0400"/>
+ <timeZone name="HNC" value="-0600"/>
+ <timeZone name="HNE" value="-0500"/>
+ <timeZone name="HNP" value="-0800"/>
+ <timeZone name="HNR" value="-0700"/>
+ <timeZone name="HNT" value="-0330"/>
+ <timeZone name="I" value="+0900"/>
+ <timeZone name="ICT" value="+0700"/>
+ <timeZone name="IDT" value="+0300"/>
+ <timeZone name="IOT" value="+0600"/>
+ <timeZone name="IRDT" value="+0430"/>
+ <timeZone name="IRKST" value="+0900"/>
+ <timeZone name="IRKT" value="+0800"/>
+ <timeZone name="IRST" value="+0330"/>
+ <!--<timeZone name="IST" value="+0200"/>-->
+ <timeZone name="IST" value="+0530"/>
+ <!--<timeZone name="IST" value="+0100"/>-->
+ <timeZone name="JST" value="+0900"/>
+ <timeZone name="K" value="+1000"/>
+ <timeZone name="KGT" value="+0600"/>
+ <timeZone name="KRAST" value="+0800"/>
+ <timeZone name="KRAT" value="+0700"/>
+ <timeZone name="KST" value="+0900"/>
+ <timeZone name="KUYT" value="+0400"/>
+ <timeZone name="L" value="+1100"/>
+ <timeZone name="LHDT" value="+1100"/>
+ <timeZone name="LHST" value="+1030"/>
+ <timeZone name="LINT" value="+1400"/>
+ <timeZone name="M" value="+1200"/>
+ <timeZone name="MAGST" value="+1200"/>
+ <timeZone name="MAGT" value="+1100"/>
+ <timeZone name="MART" value="-0930"/>
+ <timeZone name="MAWT" value="+0500"/>
+ <timeZone name="MDT" value="-0600"/>
+ <timeZone name="MHT" value="+1200"/>
+ <timeZone name="MMT" value="+0630"/>
+ <timeZone name="MSD" value="+0400"/>
+ <timeZone name="MSK" value="+0300"/>
+ <timeZone name="MST" value="-0700"/>
+ <timeZone name="MUT" value="+0400"/>
+ <timeZone name="MVT" value="+0500"/>
+ <timeZone name="MYT" value="+0800"/>
+ <timeZone name="N" value="-0100"/>
+ <timeZone name="NCT" value="+1100"/>
+ <timeZone name="NDT" value="-0230"/>
+ <timeZone name="NFT" value="+1130"/>
+ <timeZone name="NOVST" value="+0700"/>
+ <timeZone name="NOVT" value="+0600"/>
+ <timeZone name="NPT" value="+0545"/>
+ <timeZone name="NST" value="-0330"/>
+ <timeZone name="NUT" value="-1100"/>
+ <timeZone name="NZDT" value="+1300"/>
+ <timeZone name="NZST" value="+1200"/>
+ <timeZone name="O" value="-0200"/>
+ <timeZone name="OMSST" value="+0700"/>
+ <timeZone name="OMST" value="+0600"/>
+ <timeZone name="P" value="-0300"/>
+ <timeZone name="PDT" value="-0700"/>
+ <timeZone name="PET" value="-0500"/>
+ <timeZone name="PETST" value="+1200"/>
+ <timeZone name="PETT" value="+1200"/>
+ <timeZone name="PGT" value="+1000"/>
+ <timeZone name="PHOT" value="+1300"/>
+ <timeZone name="PHT" value="+0800"/>
+ <timeZone name="PKT" value="+0500"/>
+ <timeZone name="PMDT" value="-0200"/>
+ <timeZone name="PMST" value="-0300"/>
+ <timeZone name="PONT" value="+1100"/>
+ <timeZone name="PST" value="-0800"/>
+ <timeZone name="PT" value="-0800"/>
+ <timeZone name="PWT" value="+0900"/>
+ <timeZone name="PYST" value="-0300"/>
+ <timeZone name="PYT" value="-0400"/>
+ <timeZone name="Q" value="-0400"/>
+ <timeZone name="R" value="-0500"/>
+ <timeZone name="RET" value="+0400"/>
+ <timeZone name="S" value="-0600"/>
+ <timeZone name="SAMT" value="+0400"/>
+ <timeZone name="SAST" value="+0200"/>
+ <timeZone name="SBT" value="+1100"/>
+ <timeZone name="SCT" value="+0400"/>
+ <timeZone name="SGT" value="+0800"/>
+ <timeZone name="SRT" value="-0300"/>
+ <timeZone name="SST" value="-1100"/>
+ <timeZone name="T" value="-0700"/>
+ <timeZone name="TAHT" value="-1000"/>
+ <timeZone name="TFT" value="+0500"/>
+ <timeZone name="TJT" value="+0500"/>
+ <timeZone name="TKT" value="-1000"/>
+ <timeZone name="TLT" value="+0900"/>
+ <timeZone name="TMT" value="+0500"/>
+ <timeZone name="TVT" value="+1200"/>
+ <timeZone name="U" value="-0800"/>
+ <timeZone name="ULAT" value="+0800"/>
+ <timeZone name="UTC" value="+0000"/>
+ <timeZone name="UYST" value="-0200"/>
+ <timeZone name="UYT" value="-0300"/>
+ <timeZone name="UZT" value="+0500"/>
+ <timeZone name="V" value="-0900"/>
+ <timeZone name="VET" value="-0430"/>
+ <timeZone name="VLAST" value="+1100"/>
+ <timeZone name="VLAT" value="+1000"/>
+ <timeZone name="VUT" value="+1100"/>
+ <timeZone name="W" value="-1000"/>
+ <timeZone name="WAST" value="+0200"/>
+ <timeZone name="WAT" value="+0100"/>
+ <timeZone name="WDT" value="+0900"/>
+ <timeZone name="WEST" value="+0100"/>
+ <timeZone name="WET" value="+0000"/>
+ <timeZone name="WFT" value="+1200"/>
+ <timeZone name="WGST" value="-0200"/>
+ <timeZone name="WGT" value="-0300"/>
+ <timeZone name="WIB" value="+0700"/>
+ <timeZone name="WIT" value="+0900"/>
+ <timeZone name="WITA" value="+0800"/>
+ <!--<timeZone name="WST" value="+0100"/>-->
+ <!--<timeZone name="WST" value="-1100"/>-->
+ <timeZone name="WST" value="+0800"/>
+ <timeZone name="WT" value="+0000"/>
+ <timeZone name="X" value="-1100"/>
+ <timeZone name="Y" value="-1200"/>
+ <timeZone name="YAKST" value="+1000"/>
+ <timeZone name="YAKT" value="+0900"/>
+ <timeZone name="YAPT" value="+1000"/>
+ <timeZone name="YEKST" value="+0600"/>
+ <timeZone name="YEKY" value="+0500"/>
+ <timeZone name="Z" value="+0000"/>
+ </dictionary>
+return $result
+};
+
+(:~
+ : Internal auxiliary function that returns an XML representation for a dictionary that contains a
+ : numeric value associated to different month name abbreviations.
+ :)
+(: The dictionary element is returned directly from the literal constructor;
+   no intermediate variable binding is needed. :)
+declare %private function normalization:month-dictionary() as element(){
+  <dictionary>
+    <month name="January" value="01">
+      <abrv>Jan</abrv>
+      <abrv>jan</abrv>
+      <abrv>JAN</abrv>
+    </month>
+    <month name="February" value="02">
+      <abrv>Feb</abrv>
+      <abrv>feb</abrv>
+      <abrv>FEB</abrv>
+    </month>
+    <month name="March" value="03">
+      <abrv>Mar</abrv>
+      <abrv>mar</abrv>
+      <abrv>MAR</abrv>
+    </month>
+    <month name="April" value="04">
+      <abrv>Apr</abrv>
+      <abrv>apr</abrv>
+      <abrv>APR</abrv>
+    </month>
+    <month name="May" value="05">
+      <abrv>MAY</abrv>
+      <abrv>may</abrv>
+    </month>
+    <month name="June" value="06">
+      <abrv>Jun</abrv>
+      <abrv>jun</abrv>
+      <abrv>JUN</abrv>
+    </month>
+    <month name="July" value="07">
+      <abrv>Jul</abrv>
+      <abrv>jul</abrv>
+      <abrv>JUL</abrv>
+    </month>
+    <month name="August" value="08">
+      <abrv>aug</abrv>
+      <abrv>Aug</abrv>
+      <abrv>AUG</abrv>
+    </month>
+    <month name="September" value="09">
+      <abrv>sep</abrv>
+      <abrv>Sep</abrv>
+      <abrv>SEP</abrv>
+    </month>
+    <month name="October" value="10">
+      <abrv>oct</abrv>
+      <abrv>OCT</abrv>
+      <abrv>Oct</abrv>
+    </month>
+    <month name="November" value="11">
+      <abrv>nov</abrv>
+      <abrv>Nov</abrv>
+      <abrv>NOV</abrv>
+    </month>
+    <month name="December" value="12">
+      <abrv>dec</abrv>
+      <abrv>Dec</abrv>
+      <abrv>DEC</abrv>
+    </month>
+  </dictionary>
+};
+
+(:~
+ : Internal auxiliary function that checks if a string is in xs:dateTime format
+ :
+ :
+ : @param $dateTime The string representation for the dateTime.
+ : @return The dateTime string if it represents the xs:dateTime format.
+ :)
+declare %private function normalization:check-dateTime($dateTime as xs:string) as xs:string{
+  (: Casting to xs:dateTime raises a dynamic error when the lexical form is
+     invalid; on success the extracted year is glued back onto the remainder
+     of the original string (everything from position 5 onwards). :)
+  let $parsed := xs:dateTime($dateTime)
+  let $year := string(year-from-dateTime($parsed))
+  return concat($year, substring($dateTime,5))
+};
+
+(:~
+ : Internal auxiliary function that checks if a string is in xs:date format
+ :
+ :
+ : @param $date The string representation for the date.
+ : @return The date string if it represents the xs:date format.
+ :)
+declare %private function normalization:check-date($date as xs:string) as xs:string{
+  (: Casting to xs:date raises a dynamic error when the lexical form is
+     invalid; on success the extracted year is glued back onto the remainder
+     of the original string (everything from position 5 onwards). :)
+  let $parsed := xs:date($date)
+  let $year := string(year-from-date($parsed))
+  return concat($year, substring($date,5))
+};
+
+(:~
+ : Internal auxiliary function that checks if a string is in xs:time format
+ :
+ :
+ : @param $Time The string representation for the time.
+ : @return The time string if it represents the xs:time format.
+ :)
+declare %private function normalization:check-time($Time as xs:string) as xs:string{
+  (: Validation happens through the xs:time cast, which raises a dynamic
+     error for malformed input.  BUGFIX: the previous condition
+     string(hours-from-time(...)) was always true for a valid time (even
+     "0" is a non-empty string), and its dead else branch returned an
+     empty sequence in violation of the declared xs:string return type.
+     The emptiness test below keeps the validating cast while always
+     returning the input for a valid time. :)
+  if (empty(hours-from-time(xs:time($Time))))
+  then ""
+  else $Time
+};
+
+
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,117 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides phonetic string similarity functions, comparing strings with basis on how they sound.
+ :
+ : These metrics are particularly effective in matching names, since names are often spelled in different
+ : ways that sound the same.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the Soundex key for a given string.
+ :
+ : <br/>
+ : Example usage : <pre> soundex-key("Robert") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> "R163" </pre>
+ :
+ : @param $s1 The string.
+ : @return The Soundex key for the given input string.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq
+ :)
+declare function simp:soundex-key ( $s1 as xs:string ) as xs:string {
+  (: Map every letter after the first one to its Soundex digit class. :)
+  let $tail := upper-case(substring($s1,2))
+  let $c1 := replace($tail,"[BFPV]","1")
+  let $c2 := replace($c1,"[CGJKQSXZ]","2")
+  let $c3 := replace($c2,"[DT]","3")
+  let $c4 := replace($c3,"L","4")
+  let $c5 := replace($c4,"[MN]","5")
+  let $c6 := replace($c5,"R","6")
+  (: Drop vowels and every other character without a digit class. :)
+  let $digits := replace($c6,"[^1-6]","")
+  (: Collapse adjacent pairs of identical digits into a single digit. :)
+  let $collapsed := replace($digits,"([1-6])\1","$1")
+  let $key := concat(upper-case(substring($s1,1,1)), $collapsed)
+  return
+    (: Recurse while repeated digits remain, then pad/truncate to 4 chars. :)
+    if (string-length($key) > 4 and matches($key,"([1-6])\1"))
+    then simp:soundex-key($key)
+    else substring(concat($key,"0000"),1,4)
+};
+
+(:~
+ : Checks if two strings have the same Soundex key.
+ :
+ : <br/>
+ : Example usage : <pre> soundex( "Robert" , "Rupert" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> true </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return Returns true if both strings have the same Soundex key and false otherwise.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq
+ :)
+declare function simp:soundex ( $s1 as xs:string, $s2 as xs:string ) as xs:boolean {
+  (: Two strings "sound alike" when their Soundex keys coincide. :)
+  let $key1 := simp:soundex-key($s1)
+  let $key2 := simp:soundex-key($s2)
+  return $key1 = $key2
+};
+
+(:~
+ : Returns the Metaphone key for a given string.
+ : The Metaphone algorithm produces variable length keys as its output, as opposed to Soundex's fixed-length keys.
+ :
+ : <br/>
+ : Example usage : <pre> metaphone-key("ALEKSANDER") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> "ALKSNTR" </pre>
+ :
+ : @param $s1 The string.
+ : @return The Metaphone key for the given input string.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq
+ :)
+declare function simp:metaphone-key ( $s1 as xs:string ) as xs:string {
+  (: Collapse doubled letters, except doubled "C". :)
+  let $aux1 := replace(upper-case($s1),"([^C])\1","$1")
+  (: BUGFIX: the initial-letter rule used "$(...)" — the end-of-string
+     anchor — and therefore could never match; a *leading* KN/GN/PN/AE/WR
+     must be anchored with "^" so that its first letter is dropped. :)
+  let $aux2 := if (matches($aux1,"^(([KGP]N)|([A]E)|([W]R))")) then (substring($aux1,2,string-length($aux1))) else ($aux1)
+  let $aux3 := replace(replace($aux2,"MB","M"),"B$","")
+  let $aux4 := replace(replace(replace(replace(replace($aux3,"CIA","XIA"),"SCH","SKH"),"CH","XH"),"C([IEY])","S$1"),"C","K")
+  let $aux5 := replace(replace($aux4,"DG([EYI])","JG$1"),"D","T")
+  let $aux6 := replace(replace($aux5,"GH([^AEIOU])","H$1"),"G(N(ED)?)$","$1")
+  let $aux7 := replace(replace(replace($aux6,"([^G]?)G([IEY])","$1J$2"),"([^G]?)G","$1K"),"GG","G")
+  let $aux8 := replace(replace(replace(replace($aux7,"([AEIOU])H([^AEIOU])","$1$2"),"CK","K"),"PH","F"),"Q","K")
+  let $aux9 := replace(replace(replace(replace(replace($aux8,"S(H|(IO)|(IA))","X$1"),"T((IO)|(IA))","X$1"),"TH","0"),"TCH","CH"),"V","F")
+  (: BUGFIX: "$WH" and "$X" likewise used the wrong anchor and were dead
+     patterns; a leading WH becomes W and a leading X becomes S. :)
+  let $aux10 := replace(replace(replace(replace(replace(replace($aux9,"^WH","W"),"W([^AEIOU])","$1"),"^X","S"),"X","KS"),"Y([^AEIOU])","$1"),"Z","S")
+  (: Keep the first character verbatim and strip the remaining vowels. :)
+  return concat(substring($aux10,1,1) , replace(substring($aux10,2,string-length($aux10)) , "[AEIOU]", ""))
+};
+
+(:~
+ : Checks if two strings have the same Metaphone key.
+ :
+ : <br/>
+ : Example usage : <pre> metaphone("ALEKSANDER", "ALEXANDRE") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> true </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return Returns true if both strings have the same Metaphone key and false otherwise.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq
+ :)
+declare function simp:metaphone ( $s1 as xs:string, $s2 as xs:string ) as xs:boolean {
+  (: Two strings match phonetically when their Metaphone keys coincide. :)
+  let $key1 := simp:metaphone-key($s1)
+  let $key2 := simp:metaphone-key($s2)
+  return $key1 = $key2
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,150 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides similarity functions for comparing sets of XML
+ : nodes (e.g., sets of XML elements, attributes or atomic values).
+ :
+ : These functions are particularly useful for matching near duplicate sets of XML nodes.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the union between two sets, using the deep-equal() function to compare the XML nodes from the sets.
+ :
+ : <br/>
+ : Example usage : <pre> deep-union ( ( "a", "b", "c") , ( "a", "a", <d/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a", "b", "c", <d/> ) </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The union of both sets.
+ : @example test/Queries/data-cleaning/set-similarity/deep-union.xq
+ :)
+declare function set:deep-union ( $s1 , $s2 ) {
+  (: Concatenate both sets and keep each node only at its first
+     (deep-equal) occurrence. :)
+  let $all := ( $s1 , $s2 )
+  for $node at $pos in $all
+  where not(some $seen in subsequence($all, 1, $pos - 1) satisfies deep-equal($seen,$node))
+  return $node
+};
+
+(:~
+ : Returns the intersection between two sets, using the deep-equal() function to compare the XML nodes from the sets.
+ :
+ : <br/>
+ : Example usage : <pre> deep-intersect ( ( "a", "b", "c") , ( "a", "a", <d/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The intersection of both sets.
+ : @example test/Queries/data-cleaning/set-similarity/deep-intersect.xq
+ :)
+declare function set:deep-intersect ( $s1 , $s2 ) {
+  for $node at $pos in $s1
+  (: keep only the first deep-equal occurrence within $s1 ... :)
+  let $unseen := not(some $seen in subsequence($s1, 1, $pos - 1) satisfies deep-equal($seen,$node))
+  (: ... and only when the node also appears somewhere in $s2 :)
+  let $shared := some $other in $s2 satisfies deep-equal($other,$node)
+  where $unseen and $shared
+  return $node
+};
+
+(:~
+ : Removes exact duplicates from a set, using the deep-equal() function to compare the XML nodes from the sets.
+ :
+ : <br/>
+ : Example usage : <pre> distinct ( ( "a", "a", <b/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a", <b/> ) </pre>
+ :
+ : @param $s A set.
+ : @return The set provided as input without the exact duplicates (i.e., returns the distinct nodes from the set provided as input).
+ : @example test/Queries/data-cleaning/set-similarity/distinct.xq
+ :)
+declare function set:distinct ( $s ) {
+  (: Keep each node only at its first (deep-equal) occurrence. :)
+  for $node at $pos in $s
+  where not(some $seen in subsequence($s, 1, $pos - 1) satisfies deep-equal($seen,$node))
+  return $node
+};
+
+(:~
+ : Returns the overlap coefficient between two sets of XML nodes.
+ : The overlap coefficient is defined as the shared information between the input sets
+ : (i.e., the size of the intersection) over the size of the smallest input set.
+ :
+ : <br/>
+ : Example usage : <pre> overlap ( ( "a", "b", <c/> ) , ( "a", "a", "b" ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The overlap coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/overlap.xq
+ :)
+declare function set:overlap ( $s1 , $s2 ) as xs:double {
+  (: Shared distinct nodes over the size of the smaller distinct set. :)
+  let $shared := count( set:deep-intersect($s1, $s2) )
+  let $smallest := min( ( count(set:distinct($s1)) , count(set:distinct($s2)) ) )
+  return $shared div $smallest
+};
+
+(:~
+ : Returns the Dice similarity coefficient between two sets of XML nodes.
+ : The Dice coefficient is defined as twice the shared information between the input sets
+ : (i.e., the size of the intersection) over the sum of the cardinalities for the input sets.
+ :
+ : <br/>
+ : Example usage : <pre> dice ( ( "a", "b", <c/> ) , ( "a", "a", "d") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The Dice similarity coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/dice.xq
+ :)
+declare function set:dice ( $s1 , $s2 ) as xs:double {
+  (: Twice the shared distinct nodes over the summed distinct set sizes. :)
+  let $shared := count( set:deep-intersect($s1,$s2) )
+  let $total := count(set:distinct($s1)) + count(set:distinct($s2))
+  return (2 * $shared) div $total
+};
+
+(:~
+ : Returns the Jaccard similarity coefficient between two sets of XML nodes.
+ : The Jaccard coefficient is defined as the size of the intersection divided by the size of the
+ : union of the input sets.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard ( ( "a", "b", <c/> ) , ( "a", "a", "d") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.25 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The Jaccard similarity coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/jaccard.xq
+ :)
+declare function set:jaccard ( $s1 , $s2 ) as xs:double {
+  (: Size of the deep intersection over the size of the deep union. :)
+  let $shared := count( set:deep-intersect($s1,$s2) )
+  let $combined := count( set:deep-union($s1,$s2) )
+  return $shared div $combined
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,249 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides token-based string similarity functions that view strings
+ : as sets or multi-sets of tokens and use set-related properties to compute similarity scores.
+ : The tokens correspond to groups of characters extracted from the strings being compared, such as
+ : individual words or character n-grams.
+ :
+ : These functions are particularly useful for matching near duplicate strings in cases where
+ : typographical conventions often lead to rearrangement of words (e.g., "John Smith" versus "Smith, John").
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the module requires the trigonometric functions of XQuery 1.1 or a math extension
+ : function such as sqrt($x as numeric) for computing the square root.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+(: In the QizX or Saxon XQuery engines, it is possible to call external functions from the Java math library :)
+(: declare namespace math = "java:java.lang.Math"; :)
+declare namespace math = "http://www.w3.org/2005/xpath-functions/math";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the individual character n-grams forming a string.
+ :
+ : <br/>
+ : Example usage : <pre> ngrams("FLWOR", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("_F" , "FL" , "LW" , "WO" , "LW" , "WO" , "OR" , "R_") </pre>
+ :
+ : @param $s The input string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The sequence of strings with the extracted n-grams.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq
+ :)
+declare function simt:ngrams ( $s as xs:string, $n as xs:integer ) as xs:string* {
+ (: '_' is the padding character; literal underscores in the input are
+    escaped as "\_" so they cannot be confused with padding. :)
+ let $pad := '_'
+ return
+ (: leading n-grams: the first $a characters, left-padded to length $n :)
+ ( for $a in 1 to $n
+ let $apad := string-join( for $aux in $a + 1 to $n return $pad , '' )
+ return concat( $apad , replace(substring($s,1,$a) , "_", "\\_") ) ,
+
+ (: interior n-grams taken directly from the string, no padding :)
+ for $b in $n + 2 to string-length($s) return replace(substring($s,$b - $n, $n), "_", "\\_") ,
+
+ (: trailing n-grams, right-padded to length $n.
+    NOTE(review): this loop's range overlaps the interior loop, so some
+    n-grams near the end are emitted twice — this matches the documented
+    example output for ngrams("FLWOR", 2); confirm the duplication is
+    intended before changing it. :)
+ for $c in string-length($s) - (if ($n = 1) then (-1) else ($n)) - 1 to string-length($s)
+ let $cpad := string-join( for $aux in string-length($s) - $c + 2 to $n return $pad , '' )
+ return concat(replace(substring($s, $c, $n), "_", "\\_"), $cpad )
+ )
+};
+
+(:~
+ : Auxiliary function for computing the cosine similarity coefficient between strings,
+ : using string descriptors based on sets of character n-grams or sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> cosine( ("aa","bb") , ("bb","aa")) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $desc1 The descriptor for the first string.
+ : @param $desc2 The descriptor for the second string.
+ : @return The cosine similarity coefficient between the descriptors for the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine.xq
+ :)
+declare function simt:cosine ( $desc1 as xs:string*, $desc2 as xs:string* ) as xs:double {
+  (: Build term-frequency vectors over the shared vocabulary and return the
+     normalized dot product (cosine) of the two vectors. :)
+  let $vocab := distinct-values( ($desc1, $desc2) )
+  let $tf1 := for $term in $vocab return count($desc1[.=$term])
+  let $tf2 := for $term in $vocab return count($desc2[.=$term])
+  let $dot := sum( for $w at $i in $tf1 return $w * $tf2[$i] )
+  let $norm1 := math:sqrt(sum( for $w in $tf1 return $w * $w ))
+  let $norm2 := math:sqrt(sum( for $w in $tf2 return $w * $w ))
+  return $dot div ($norm1 * $norm2)
+};
+
+(:~
+ : Returns the Dice similarity coefficient between sets of character n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> dice-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4615384615384616 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The Dice similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq
+ :)
+declare function simt:dice-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+  (: Compare the two strings through their character n-gram sets. :)
+  let $grams1 := simt:ngrams($s1,$n)
+  let $grams2 := simt:ngrams($s2,$n)
+  return set:dice($grams1,$grams2)
+};
+
+(:~
+ : Returns the overlap similarity coefficient between sets of character n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> overlap-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The overlap similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq
+ :)
+declare function simt:overlap-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+  (: Compare the two strings through their character n-gram sets. :)
+  let $grams1 := simt:ngrams($s1,$n)
+  let $grams2 := simt:ngrams($s2,$n)
+  return set:overlap($grams1,$grams2)
+};
+
+(:~
+ : Returns the Jaccard similarity coefficient between sets of character n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.3 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The Jaccard similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq
+ :)
+declare function simt:jaccard-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+  (: Compare the two strings through their character n-gram sets. :)
+  let $grams1 := simt:ngrams($s1,$n)
+  let $grams2 := simt:ngrams($s2,$n)
+  return set:jaccard($grams1,$grams2)
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of character n-grams extracted from two strings.
+ : The n-grams from each string are weighted according to their occurrence frequency (i.e., weighted according to
+ : the term-frequency heuristic from Information Retrieval).
+ :
+ : <br/>
+ : Example usage : <pre> cosine-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.2401922307076307 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The cosine similarity coefficient between the sets n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq
+ :)
+declare function simt:cosine-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+ let $ngrams1 := simt:ngrams($s1,$n)
+ let $ngrams2 := simt:ngrams($s2,$n)
+ return simt:cosine($ngrams1,$ngrams2)
+};
+
+(:~
+ : Returns the Dice similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> dice-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The Dice similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq
+ :)
+declare function simt:dice-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ set:dice( tokenize($s1,$r) , tokenize($s2,$r) )
+};
+
+(:~
+ : Returns the overlap similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> overlap-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The overlap similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq
+ :)
+declare function simt:overlap-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ set:overlap( tokenize($s1,$r) , tokenize($s2,$r) )
+};
+
+(:~
+ : Returns the Jaccard similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.25 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The Jaccard similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq
+ :)
+declare function simt:jaccard-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ set:jaccard( tokenize($s1,$r) , tokenize($s2,$r) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings. The tokens
+ : from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ :
+ : <br/>
+ : Example usage : <pre> cosine-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.408248290463863 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq
+ :)
+declare function simt:cosine-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ let $tokens1 := tokenize($s1,$r)
+ let $tokens2 := tokenize($s2,$r)
+ return simt:cosine($tokens1,$tokens2)
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd'
--- src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd 2011-12-22 13:29:42 +0000
@@ -0,0 +1,343 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="qualified"
+ targetNamespace="http://api.whitepages.com/schema/" xmlns:wp="http://api.whitepages.com/schema/">
+<!--
+:: Copyright 2006-2008 The FLWOR Foundation.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+::
+-->
+
+ <xs:element name="wp">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:result"/>
+ <xs:element minOccurs="0" ref="wp:errormessages"/>
+ <xs:element minOccurs="0" ref="wp:meta"/>
+ <xs:element minOccurs="0" ref="wp:listings"/>
+ <xs:element minOccurs="0" ref="wp:options"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="result">
+ <xs:complexType>
+ <xs:attribute name="code" use="required" type="wp:responsecode"/>
+ <xs:attribute name="message"/>
+ <xs:attribute name="type" use="required" type="wp:responsetype"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:simpleType name="responsetype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="success"/>
+ <xs:enumeration value="error"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <xs:simpleType name="responsecode">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="Found Data"/>
+ <xs:enumeration value="No Data Found"/>
+ <xs:enumeration value="Truncated Data"/>
+ <xs:enumeration value="Error"/>
+ <xs:enumeration value="Server Error"/>
+ <xs:enumeration value="Invalid Input"/>
+ <xs:enumeration value="Mismatched Input"/>
+ <xs:enumeration value="Missing Input"/>
+ <xs:enumeration value="Refine Input"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <xs:element name="errormessages">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:message"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="message" type="xs:string"/>
+ <xs:element name="meta">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:linkexpiration"/>
+ <xs:element ref="wp:recordrange"/>
+ <xs:element ref="wp:apiversion"/>
+ <xs:element ref="wp:searchid"/>
+ <xs:element ref="wp:searchlinks"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="linkexpiration" type="xs:date"/>
+ <xs:element name="recordrange">
+ <xs:complexType>
+ <xs:attribute name="lastrecord" use="required" type="xs:integer"/>
+ <xs:attribute name="firstrecord" use="required" type="xs:integer"/>
+ <xs:attribute name="totalavailable" use="required" type="xs:integer"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="apiversion" type="xs:string"/>
+ <xs:element name="searchid" type="xs:string"/>
+ <xs:element name="searchlinks">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:link"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="listings">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:listing"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="listing">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:people"/>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:business"/>
+ <xs:element minOccurs="0" ref="wp:displayname"/>
+ <xs:element minOccurs="0" ref="wp:tagline"/>
+ <xs:element minOccurs="0" ref="wp:phonenumbers"/>
+ <xs:element minOccurs="0" ref="wp:address"/>
+ <xs:element minOccurs="0" ref="wp:geodata"/>
+ <xs:element minOccurs="0" ref="wp:listingmeta"/>
+ </xs:sequence>
+ <xs:attribute name="sponsored" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="people">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:person"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="person" type="wp:personType"/>
+ <xs:complexType name="personType">
+ <xs:sequence>
+ <xs:element ref="wp:firstname"/>
+ <xs:element minOccurs="0" ref="wp:middlename"/>
+ <xs:element ref="wp:lastname"/>
+ <xs:element minOccurs="0" ref="wp:suffix"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ </xs:complexType>
+
+ <xs:simpleType name="rank">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="primary"/>
+ <xs:enumeration value="secondary"/>
+ <xs:enumeration value="tertiary"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="firstname" type="xs:string"/>
+ <xs:element name="middlename" type="xs:string"/>
+ <xs:element name="lastname" type="xs:string"/>
+ <xs:element name="suffix" type="xs:string"/>
+ <xs:element name="business">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:businessname"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="businessname" type="xs:string"/>
+ <xs:element name="displayname" type="xs:string"/>
+ <xs:element name="tagline" type="xs:string"/>
+ <xs:element name="phonenumbers">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:phone"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="phone">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:fullphone"/>
+ <xs:element minOccurs="0" ref="wp:areacode"/>
+ <xs:element ref="wp:exchange"/>
+ <xs:element ref="wp:linenumber"/>
+ <xs:element minOccurs="0" ref="wp:carrier"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ <xs:attribute name="type" use="required" type="wp:listingtype"/>
+ <xs:attribute name="carrier_only" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:simpleType name="listingtype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="work"/>
+ <xs:enumeration value="home"/>
+ <xs:enumeration value="business"/>
+ <xs:enumeration value="government"/>
+ <xs:enumeration value="mobile"/>
+ <xs:enumeration value="landline"/>
+ <xs:enumeration value="pager"/>
+ <xs:enumeration value="satellite"/>
+ <xs:enumeration value="unknown"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="fullphone" type="xs:string"/>
+ <xs:element name="areacode" type="xs:string"/>
+ <xs:element name="exchange" type="xs:string"/>
+ <xs:element name="linenumber" type="xs:string"/>
+ <xs:element name="carrier" type="xs:string"/>
+ <xs:element name="address">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:fullstreet"/>
+ <xs:element minOccurs="0" ref="wp:house"/>
+ <xs:element minOccurs="0" ref="wp:predir"/>
+ <xs:element minOccurs="0" ref="wp:street"/>
+ <xs:element minOccurs="0" ref="wp:postdir"/>
+ <xs:element minOccurs="0" ref="wp:streettype"/>
+ <xs:element minOccurs="0" ref="wp:aptnumber"/>
+ <xs:element minOccurs="0" ref="wp:apttype"/>
+ <xs:element minOccurs="0" ref="wp:city"/>
+ <xs:element minOccurs="0" ref="wp:state"/>
+ <xs:element minOccurs="0" ref="wp:zip"/>
+ <xs:element minOccurs="0" ref="wp:zip4"/>
+ <xs:element minOccurs="0" ref="wp:country"/>
+ </xs:sequence>
+ <xs:attribute name="deliverable" use="required" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:simpleType name="country">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="US"/>
+ <xs:enumeration value="CA"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="fullstreet" type="xs:string"/>
+ <xs:element name="house" type="xs:string"/>
+ <xs:element name="predir" type="xs:string"/>
+ <xs:element name="street" type="xs:string"/>
+ <xs:element name="postdir" type="xs:string"/>
+ <xs:element name="streettype" type="xs:string"/>
+ <xs:element name="aptnumber" type="xs:string"/>
+ <xs:element name="apttype" type="xs:string"/>
+ <xs:element name="city" type="xs:string"/>
+ <xs:element name="state" type="xs:string"/>
+ <xs:element name="zip" type="xs:string"/>
+ <xs:element name="zip4" type="xs:string"/>
+ <xs:element name="country" type="wp:country"/>
+ <xs:element name="geodata">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:geoprecision"/>
+ <xs:element ref="wp:latitude"/>
+ <xs:element ref="wp:longitude"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="geoprecision" type="xs:integer"/>
+ <xs:element name="latitude" type="xs:string"/>
+ <xs:element name="longitude" type="xs:string"/>
+ <xs:element name="previous_locations">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="previous_location" maxOccurs="unbounded" type="wp:locationType"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:complexType name="locationType">
+ <xs:sequence>
+ <xs:element name="city" type="xs:string"/>
+ <xs:element name="state" type="xs:string"/>
+ <xs:element name="year" type="xs:string"/>
+ </xs:sequence>
+ </xs:complexType>
+ <xs:element name="listingmeta">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:lastvalidated"/>
+ <xs:element minOccurs="0" ref="wp:type"/>
+ <xs:element minOccurs="0" ref="wp:sponsor"/>
+ <xs:element minOccurs="0" ref="wp:recordid"/>
+ <xs:element ref="wp:moreinfolinks"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="lastvalidated" type="xs:string"/>
+ <xs:element name="sponsor" type="xs:string"/>
+ <xs:element name="recordid" type="xs:string"/>
+ <xs:element name="type" type="wp:listingtype"/>
+ <xs:element name="moreinfolinks">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:link"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="options">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:cityoption" minOccurs="0" />
+ <xs:element maxOccurs="unbounded" ref="wp:categoryoption" minOccurs="0" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="cityoption">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:refinesearchurl"/>
+ </xs:sequence>
+ <xs:attribute name="city" use="required" type="xs:string"/>
+ <xs:attribute name="country" use="required" type="wp:country"/>
+ <xs:attribute name="state" use="required" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="categoryoption">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:refinesearchurl"/>
+ </xs:sequence>
+ <xs:attribute name="total" use="required" type="xs:string"/>
+ <xs:attribute name="description" use="required" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="refinesearchurl" type="xs:anyURI"/>
+ <xs:element name="link">
+ <xs:complexType>
+ <xs:simpleContent>
+ <xs:extension base="xs:anyURI">
+ <xs:attribute name="linktext" use="required" type="xs:string"/>
+ <xs:attribute name="type" use="required" type="wp:linktype"/>
+ </xs:extension>
+ </xs:simpleContent>
+ </xs:complexType>
+ </xs:element>
+ <xs:simpleType name="linktype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="drivingdirections"/>
+ <xs:enumeration value="findneighbors"/>
+ <xs:enumeration value="homepage"/>
+ <xs:enumeration value="viewdetails"/>
+ <xs:enumeration value="viewmap"/>
+
+ <xs:enumeration value="mapareacode"/>
+
+ <xs:enumeration value="allresults"/>
+ <xs:enumeration value="mapallresults"/>
+ <xs:enumeration value="self"/>
+ <xs:enumeration value="worklistings"/>
+
+ <xs:enumeration value="viewsearchsuggestions"/>
+ </xs:restriction>
+ </xs:simpleType>
+</xs:schema>
\ No newline at end of file
=== added directory 'test'
=== renamed directory 'test' => 'test.moved'
=== added directory 'test/ExpQueryResults'
=== added directory 'test/ExpQueryResults/data-cleaning'
=== added directory 'test/ExpQueryResults/data-cleaning/character-based-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+2
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.8577777777777778
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5853174603174604
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0
=== added directory 'test/ExpQueryResults/data-cleaning/consolidation'
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<c/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<c/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+eeefff
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+b
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+aaa
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a A b c AAA d
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a att1="a1" att2="a2"/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a att1="a1" att2="a2" att3="a3"/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/><c/><d/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+aaabbb
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a b c
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+aaa bbb
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/conversion'
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Portugal Lisbon praça Marquês de Pombal
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+4610 Harrison Bend Rd, Loudon, TN, US
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+222 E 53rd St, Los Angeles, CA, US
=== added file 'test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.747887218607434
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+38 -10
=== added file 'test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+(520) 824-3160 (520) 824-3160
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/phone-from-user.xml.res'
=== added file 'test/ExpQueryResults/data-cleaning/conversion/unit-convert.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/unit-convert.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/unit-convert.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1.609344
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Sunizona Greenhouses Inc Stan Smith
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Gene Simpson Homer V Simpson Homer Simpson Sue M Simpson
=== added directory 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.907838383838384
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/normalization'
=== added file 'test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Portugal Lisbon Marquês de Pombal
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+2002-10-24
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+2002-10-24T21:22:00
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+09:10:00
=== added directory 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+ALKSNTR
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+ALKSNTR
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+R163
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+true
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/set-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a b c<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.4
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a<b/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.25
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/token-based-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.2401922307076307
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.408248290463863
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.4615384615384616
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.4
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.3
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.25
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+_F FL LW WO LW WO OR R_
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added directory 'test/Queries'
=== added directory 'test/Queries/data-cleaning'
=== added directory 'test/Queries/data-cleaning/character-based-string-similarity'
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:edit-distance("FLWOR", "FLOWER")
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:jaro-winkler("DWAYNE", "DUANE", 4, 0.1 )
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/jaro.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/jaro.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/jaro.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:jaro("FLWOR Found.", "FLWOR Foundation")
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:needleman-wunsch("KAK", "KQRK", 1, 1)
=== added directory 'test/Queries/data-cleaning/consolidation'
=== added file 'test/Queries/data-cleaning/consolidation/least-attributes.xq'
--- test/Queries/data-cleaning/consolidation/least-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-elements.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-elements( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-elements.xq'
--- test/Queries/data-cleaning/consolidation/least-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-elements( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-nodes.xq'
--- test/Queries/data-cleaning/consolidation/least-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq'
--- test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" )
=== added file 'test/Queries/data-cleaning/consolidation/least-tokens.xq'
--- test/Queries/data-cleaning/consolidation/least-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-tokens( ( "a b c", "a b", "a"), " +" )
=== added file 'test/Queries/data-cleaning/consolidation/leastfrequent_1.xq'
--- test/Queries/data-cleaning/consolidation/leastfrequent_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/leastfrequent_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-frequent( ( "a", "a", "b") )
=== added file 'test/Queries/data-cleaning/consolidation/longest_1.xq'
--- test/Queries/data-cleaning/consolidation/longest_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/longest_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:longest( ( "a", "aa", "aaa") )
=== added file 'test/Queries/data-cleaning/consolidation/matching_1.xq'
--- test/Queries/data-cleaning/consolidation/matching_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/matching_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:matching( ( "a A b", "c AAA d", "e BB f"), "A+" )
=== added file 'test/Queries/data-cleaning/consolidation/most-attributes.xq'
--- test/Queries/data-cleaning/consolidation/most-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-attributes( ( <a att1="a1" att2="a2" att3="a3"/>, <a att1="a1" att2="a2"><b att2="a2" /></a>, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-elements.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-elements( ( <a><b/><c/><d/></a>, <a><b/><b/><c/></a>, <a/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-nodes( ( <a><b/></a>, <a><a/></a>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-elements.xq'
--- test/Queries/data-cleaning/consolidation/most-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-elements( ( <a><b/></a>, <a/>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-frequent.xq'
--- test/Queries/data-cleaning/consolidation/most-frequent.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-frequent.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-frequent( ( "a", "a", "b") )
=== added file 'test/Queries/data-cleaning/consolidation/most-nodes.xq'
--- test/Queries/data-cleaning/consolidation/most-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-nodes( ( <a><b/></a>, <a/>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq'
--- test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" )
=== added file 'test/Queries/data-cleaning/consolidation/most-tokens.xq'
--- test/Queries/data-cleaning/consolidation/most-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-tokens( ( "a b c", "a b", "a"), " +" )
=== added file 'test/Queries/data-cleaning/consolidation/shortest_1.xq'
--- test/Queries/data-cleaning/consolidation/shortest_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/shortest_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:shortest( ( "a", "aa", "aaa") )
=== added file 'test/Queries/data-cleaning/consolidation/superstring_1.xq'
--- test/Queries/data-cleaning/consolidation/superstring_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/superstring_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:superstring( ( "aaa bbb ccc", "aaa bbb", "aaa ddd", "eee fff" ) )
=== added directory 'test/Queries/data-cleaning/conversion'
=== added file 'test/Queries/data-cleaning/conversion/address-from-geocode.xq'
--- test/Queries/data-cleaning/conversion/address-from-geocode.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-geocode.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-geocode ( 38.725735 , -9.15021 )
=== added file 'test/Queries/data-cleaning/conversion/address-from-phone.xq'
--- test/Queries/data-cleaning/conversion/address-from-phone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-phone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-phone ('8654582358')
=== added file 'test/Queries/data-cleaning/conversion/address-from-user.xq'
--- test/Queries/data-cleaning/conversion/address-from-user.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-user.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-user ('Maria Lurdes')
=== added file 'test/Queries/data-cleaning/conversion/currency-convert.xq'
--- test/Queries/data-cleaning/conversion/currency-convert.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/currency-convert.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:currency-convert ( 1, "USD", "EUR", "2011-01-18" )
=== added file 'test/Queries/data-cleaning/conversion/geocode-from-address.xq'
--- test/Queries/data-cleaning/conversion/geocode-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/geocode-from-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,5 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+let $geocode := conversion:geocode-from-address ( ("Lisboa", "Portugal") )
+for $result in $geocode
+return floor($result)
=== added file 'test/Queries/data-cleaning/conversion/phone-from-address.xq'
--- test/Queries/data-cleaning/conversion/phone-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/phone-from-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:phone-from-address('5655 E Gaskill Rd, Willcox, AZ, US')
=== added file 'test/Queries/data-cleaning/conversion/phone-from-user.xq'
--- test/Queries/data-cleaning/conversion/phone-from-user.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/phone-from-user.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:phone-from-user ('Maria Lurdes')
=== added file 'test/Queries/data-cleaning/conversion/unit-convert.xq'
--- test/Queries/data-cleaning/conversion/unit-convert.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/unit-convert.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:unit-convert ( 1 , "Distance", "mile", "kilometer" )
=== added file 'test/Queries/data-cleaning/conversion/user-from-address.xq'
--- test/Queries/data-cleaning/conversion/user-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/user-from-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:user-from-address('5655 E Gaskill Rd, Willcox, AZ, US')
=== added file 'test/Queries/data-cleaning/conversion/user-from-phone.xq'
--- test/Queries/data-cleaning/conversion/user-from-phone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/user-from-phone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:user-from-phone ('8654582358')
=== added directory 'test/Queries/data-cleaning/hybrid-string-similarity'
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:monge-elkan-jaro-winkler("Comput. Sci. and Eng. Dept., University of California, San Diego", "Department of Computer Scinece, Univ. Calif., San Diego", 4, 0.1)
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-edit-distance("The FLWOR Foundation", "FLWOR Found.", " +", 0 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-jaro-winkler("The FLWOR Foundation", "FLWOR Found.", " +", 1, 4, 0.1 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-jaro("The FLWOR Foundation", "FLWOR Found.", " +", 1 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-metaphone("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +" )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-soundex("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +")
=== added directory 'test/Queries/data-cleaning/normalization'
=== added file 'test/Queries/data-cleaning/normalization/normalize-address.xq'
--- test/Queries/data-cleaning/normalization/normalize-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/normalize-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:normalize-address ( ( 'Marques de Pombal' , 'Lisboa' ) )
=== added file 'test/Queries/data-cleaning/normalization/to-date.xq'
--- test/Queries/data-cleaning/normalization/to-date.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-date.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-date ( "24OCT2002" , "%d%b%Y" )
=== added file 'test/Queries/data-cleaning/normalization/to-dateTime.spec'
--- test/Queries/data-cleaning/normalization/to-dateTime.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-dateTime.spec 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/data-cleaning/normalization:notsupported
=== added file 'test/Queries/data-cleaning/normalization/to-dateTime.xq'
--- test/Queries/data-cleaning/normalization/to-dateTime.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-dateTime.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-dateTime( "24OCT2002 21:22" , "%d%b%Y %H%M" )
=== added file 'test/Queries/data-cleaning/normalization/to-time.spec'
--- test/Queries/data-cleaning/normalization/to-time.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-time.spec 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/data-cleaning/normalization:notsupported
=== added file 'test/Queries/data-cleaning/normalization/to-time.xq'
--- test/Queries/data-cleaning/normalization/to-time.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-time.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-time ( "09 hours 10 minutes" , "%H hours %M minutes" )
=== added directory 'test/Queries/data-cleaning/phonetic-string-similarity'
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:metaphone-key("ALEKSANDER")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:metaphone-key("ALEKSANDER")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:soundex-key("Robert")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:soundex( "Robert" , "Rupert" )
=== added directory 'test/Queries/data-cleaning/set-similarity'
=== added file 'test/Queries/data-cleaning/set-similarity/deep-intersect.xq'
--- test/Queries/data-cleaning/set-similarity/deep-intersect.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/deep-intersect.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:deep-intersect ( ( "a", "b", "c") , ( "a", "a", <d/> ) )
=== added file 'test/Queries/data-cleaning/set-similarity/deep-union.xq'
--- test/Queries/data-cleaning/set-similarity/deep-union.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/deep-union.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:deep-union ( ( "a", "b", "c") , ( "a", "a", <d/> ) )
=== added file 'test/Queries/data-cleaning/set-similarity/dice.xq'
--- test/Queries/data-cleaning/set-similarity/dice.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/dice.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:dice ( ( "a", "b", <c/> ) , ( "a", "a", "d") )
=== added file 'test/Queries/data-cleaning/set-similarity/distinct.xq'
--- test/Queries/data-cleaning/set-similarity/distinct.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/distinct.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:distinct (( "a", "a", <b/> ))
=== added file 'test/Queries/data-cleaning/set-similarity/jaccard.xq'
--- test/Queries/data-cleaning/set-similarity/jaccard.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/jaccard.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:jaccard ( ( "a", "b", <c/> ) , ( "a", "a", "d") )
=== added file 'test/Queries/data-cleaning/set-similarity/overlap.xq'
--- test/Queries/data-cleaning/set-similarity/overlap.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/overlap.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:overlap ( ( "a", "b", <c/> ) , ( "a", "a", "b" ) )
=== added directory 'test/Queries/data-cleaning/token-based-string-similarity'
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine( ("aa","bb") , ("bb","aa"))
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:dice-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:dice-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:jaccard-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:jaccard-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:ngrams("FLWOR", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:overlap-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:overlap-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )