zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #01240
[Merge] lp:~matthias-brantner/zorba/data-cleaning into lp:zorba
Matthias Brantner has proposed merging lp:~matthias-brantner/zorba/data-cleaning into lp:zorba.
Requested reviews:
Diogo Simões (diogo-simoes89)
For more details, see:
https://code.launchpad.net/~matthias-brantner/zorba/data-cleaning/+merge/79758
reenabled the smith-waterman
The function finally terminates and is much faster than before (with the latest version of Zorba).
--
https://code.launchpad.net/~matthias-brantner/zorba/data-cleaning/+merge/79758
Your team Zorba Coders is subscribed to branch lp:zorba.
=== added file 'CMakeLists.txt'
--- CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,34 @@
+# Copyright 2006-2010 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+
+PROJECT (zorba_data-cleaning_module)
+# Turn on test support; CTest provides the driver used for the tests
+# registered by ADD_TEST_DIRECTORY below.
+ENABLE_TESTING ()
+INCLUDE (CTest)
+
+# Make the helper scripts shipped in cmake_modules/ (e.g.
+# CMakeCompareVersionStrings.cmake) reachable via INCLUDE().
+LIST (APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules")
+
+# Locate Zorba (optionally from an uninstalled build tree via
+# ZORBA_BUILD_DIR) and load its module-building macros, which define
+# ADD_TEST_DIRECTORY, EXPECTED_FAILURE, DECLARE_ZORBA_MODULE, etc.
+FIND_PACKAGE (Zorba REQUIRED HINTS "${ZORBA_BUILD_DIR}")
+INCLUDE ("${Zorba_USE_FILE}")
+
+# Register the queries under test/ as CTest tests.
+# NOTE(review): EXCEPTION_LIST is not set anywhere in this file —
+# presumably populated by the Zorba use-file; confirm.
+ADD_TEST_DIRECTORY("${PROJECT_SOURCE_DIR}/test" "${EXCEPTION_LIST}")
+
+# Known-failing tests; the trailing number is the tracking bug id
+# (presumably Launchpad bug #871051 — confirm).
+EXPECTED_FAILURE(zorba_data-cleaning_module/data-cleaning/conversion/address-from-user.xq 871051)
+EXPECTED_FAILURE(zorba_data-cleaning_module/data-cleaning/conversion/phone-from-user.xq 871051)
+EXPECTED_FAILURE(zorba_data-cleaning_module/data-cleaning/conversion/user-from-phone.xq 871051)
+
+ADD_SUBDIRECTORY("src")
+
+# Tell Zorba's module framework that all module URIs are declared.
+DONE_DECLARING_ZORBA_URIS()
=== renamed file 'CMakeLists.txt' => 'CMakeLists.txt.moved'
=== added directory 'cmake_modules'
=== renamed directory 'cmake_modules' => 'cmake_modules.moved'
=== added file 'cmake_modules/CMakeCompareVersionStrings.cmake'
--- cmake_modules/CMakeCompareVersionStrings.cmake 1970-01-01 00:00:00 +0000
+++ cmake_modules/CMakeCompareVersionStrings.cmake 2011-10-19 02:07:27 +0000
@@ -0,0 +1,84 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Computes the relationship between two version strings. A version
+# string is a number delineated by '.'s such as 1.3.2 and 0.99.9.1.
+# You can feed version strings with different number of dot versions,
+# and the shorter version number will be padded with zeros: 9.2 <
+# 9.2.1 will actually compare 9.2.0 < 9.2.1.
+#
+# Input: a_in - value, not variable
+# b_in - value, not variable
+# result_out - variable with value:
+# -1 : a_in < b_in
+# 0 : a_in == b_in
+# 1 : a_in > b_in
+#
+# Written by James Bigler.
+MACRO(COMPARE_VERSION_STRINGS a_in b_in result_out)
+ # Since SEPARATE_ARGUMENTS uses ' ' as the separation token,
+ # replace '.' with ' ' to allow easy tokenization of the string.
+ STRING(REPLACE "." " " a "${a_in}")
+ STRING(REPLACE "." " " b "${b_in}")
+ SEPARATE_ARGUMENTS(a)
+ SEPARATE_ARGUMENTS(b)
+
+ # Measure both component lists; the shorter one is zero-padded below.
+ LIST(LENGTH a a_length)
+ LIST(LENGTH b b_length)
+
+ # Pad the shorter list with zeros.
+
+ # Note that range needs to be one less than the length as the for
+ # loop is inclusive (silly CMake).
+ IF(a_length LESS b_length)
+ # a is shorter
+ SET(shorter a)
+ MATH(EXPR range "${b_length} - 1")
+ MATH(EXPR pad_range "${b_length} - ${a_length} - 1")
+ ELSE(a_length LESS b_length)
+ # b is shorter (this branch is also taken when the lengths are equal,
+ # in which case pad_range is -1 and no padding happens)
+ SET(shorter b)
+ MATH(EXPR range "${a_length} - 1")
+ MATH(EXPR pad_range "${a_length} - ${b_length} - 1")
+ ENDIF(a_length LESS b_length)
+
+ # PAD out if we need to
+ IF(NOT pad_range LESS 0)
+ FOREACH(pad RANGE ${pad_range})
+ # 'shorter' holds the NAME of the shorter list's variable (a or b),
+ # so it must be dereferenced to append to that list.
+ LIST(APPEND ${shorter} 0)
+ ENDFOREACH(pad RANGE ${pad_range})
+ ENDIF(NOT pad_range LESS 0)
+
+ # Walk the components left to right; the first inequality decides.
+ SET(result 0)
+ FOREACH(index RANGE ${range})
+ IF(result EQUAL 0)
+ # Only continue to compare things as long as they are equal
+ LIST(GET a ${index} a_version)
+ LIST(GET b ${index} b_version)
+ # LESS
+ IF(a_version LESS b_version)
+ SET(result -1)
+ ENDIF(a_version LESS b_version)
+ # GREATER
+ IF(a_version GREATER b_version)
+ SET(result 1)
+ ENDIF(a_version GREATER b_version)
+ ENDIF(result EQUAL 0)
+ ENDFOREACH(index)
+
+ # Copy out the return result
+ SET(${result_out} ${result})
+ENDMACRO(COMPARE_VERSION_STRINGS)
=== added directory 'src'
=== renamed directory 'src' => 'src.moved'
=== added file 'src/CMakeLists.txt'
--- src/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,20 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# External module libraries are emitted next to their corresponding
+# .xq source files; descend into the module tree rooted at com/.
+message(STATUS "Add com")
+add_subdirectory(com)
+
+message(STATUS "End modules")
=== added directory 'src/com'
=== added file 'src/com/CMakeLists.txt'
--- src/com/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Module libraries are built alongside their .xq sources; recurse into
+# the next level of the namespace directory hierarchy.
+add_subdirectory(zorba-xquery)
=== added directory 'src/com/zorba-xquery'
=== added file 'src/com/zorba-xquery/CMakeLists.txt'
--- src/com/zorba-xquery/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Module libraries are built alongside their .xq sources; recurse into
+# the next level of the namespace directory hierarchy.
+add_subdirectory(www)
=== added directory 'src/com/zorba-xquery/www'
=== added file 'src/com/zorba-xquery/www/CMakeLists.txt'
--- src/com/zorba-xquery/www/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Module libraries are built alongside their .xq sources; recurse into
+# the next level of the namespace directory hierarchy.
+add_subdirectory(modules)
=== added directory 'src/com/zorba-xquery/www/modules'
=== added file 'src/com/zorba-xquery/www/modules/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Module libraries are built alongside their .xq sources; recurse into
+# the data-cleaning module directory itself.
+add_subdirectory(data-cleaning)
=== added directory 'src/com/zorba-xquery/www/modules/data-cleaning'
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt 2011-10-19 02:07:27 +0000
@@ -0,0 +1,40 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Register each data-cleaning library module with the Zorba build
+# framework under its namespace URI, all at module version 2.0.
+# NOTE(review): the archived diff showed a stray ';' after every quoted
+# URI (and before the closing paren of DECLARE_ZORBA_SCHEMA) — an
+# extraction artifact that would corrupt the URI arguments; removed.
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity"
+ VERSION 2.0 FILE "character-based-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/consolidation"
+ VERSION 2.0 FILE "consolidation.xq")
+
+# Schema referenced by the conversion module (WhitePages API data).
+DECLARE_ZORBA_SCHEMA( FILE whitepages_schema.xsd
+ URI "http://api.whitepages.com/schema/")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/conversion"
+ VERSION 2.0 FILE "conversion.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity"
+ VERSION 2.0 FILE "hybrid-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/normalization"
+ VERSION 2.0 FILE "normalization.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity"
+ VERSION 2.0 FILE "phonetic-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity"
+ VERSION 2.0 FILE "set-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity"
+ VERSION 2.0 FILE "token-based-string-similarity.xq")
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,177 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides character-based string similarity functions
+ : that view strings as sequences of characters, generally computing a similarity score
+ : that corresponds to the cost of transforming one string into another.
+ :
+ : These functions are particularly useful for matching near duplicate strings
+ : in the presence of typographical errors.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+(: Zorba versioning option: this is interface version 2.0 of the module.
+   The doubled ";;" separators in the archived diff were extraction
+   artifacts — a single ";" is the valid XQuery separator. :)
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the edit distance between two strings.
+ :
+ : This distance, also referred to as the Levenshtein distance, is defined as the minimum number
+ : of edits needed to transform one string into the other, with the allowable edit operations
+ : being insertion, deletion, or substitution of a single character.
+ :
+ : <br/>
+ : Example usage : <pre> edit-distance("FLWOR", "FLOWER") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 2 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return The edit distance between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq
+ :)
+declare function simc:edit-distance ( $s1 as xs:string, $s2 as xs:string ) as xs:integer {
+ (: Base cases: against the empty string the distance is one edit per remaining character. :)
+ if(string-length($s1) = 0) then string-length($s2) else
+ if(string-length($s2) = 0) then string-length($s1) else
+ (: Recursive case: minimum over deleting the head of $s1, deleting the head of $s2,
+    or consuming both heads (cost 0 on a match, 1 on a substitution).
+    NOTE(review): this naive three-way recursion is exponential in the string lengths;
+    fine for short inputs, but a memoized/dynamic-programming form is needed for long ones. :)
+ min((
+ simc:edit-distance(substring($s1, 2), $s2) + 1 ,
+ simc:edit-distance($s1, substring($s2, 2)) + 1 ,
+ simc:edit-distance(substring($s1, 2), substring($s2, 2)) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then 0 else 1 )
+ ))
+};
+
+(:~
+ : Returns the Jaro similarity coefficient between two strings.
+ :
+ : This similarity coefficient is based on the number of transposed characters and on a
+ : weighted sum of the percentage of matched characters held within the strings. The higher
+ : the Jaro-Winkler value is, the more similar the strings are. The coefficient is
+ : normalized such that 0 equates to no similarity and 1 is an exact match.
+ :
+ : <br/>
+ : Example usage : <pre> jaro("FLWOR Found.", "FLWOR Foundation") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5853174603174603 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return The Jaro similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/jaro.xq
+ :)
+declare function simc:jaro ( $s1 as xs:string, $s2 as xs:string ) as xs:double {
+ (: Order the two strings by length so $s[1] is the shorter and $s[2] the longer. :)
+ let $s := for $i in ($s1,$s2) order by string-length($i) return $i
+ let $l1 := string-length($s[1])
+ let $l2 := string-length($s[2])
+ (: Match-window radius: half the longer length, minus one (cast truncates). :)
+ let $mt := xs:integer((max(($l1,$l2)) div 2.0) - 1)
+ (: One <match> element per character of the shorter string that also occurs in the
+    longer string within the window; pos1/pos2 record the two positions. :)
+ let $mc := for $i in 1 to min( ($l1 , $l2) )
+ let $auxmatch := substring($s[2], max((1,$i - $mt)), $mt * 2 )
+ return for $j in 1 to string-length($auxmatch)
+ where substring($auxmatch, $j, 1) = substring($s[1], $i, 1)
+ return <match char="{substring($s[1], $i, 1)}" pos1="{$i}" pos2="{$j + max((1,$i - $mt)) - 1}" />
+ (: NOTE(review): $m is floored at 1 to avoid division by zero below, so two strings
+    with NO matching characters still score above 0 — confirm this is intended. :)
+ let $m := if (count($mc) = 0) then (1) else (count($mc))
+ (: Count out-of-order match pairs. NOTE(review): the classical Jaro formula uses half
+    the number of transposed characters; here each inverted pair counts once — verify
+    against the reference definition. :)
+ let $t := count( for $i in $mc, $j in $mc where $i/@pos1>$j/@pos1 and $i/@pos2<$j/@pos2 return $i )
+ (: Jaro score: average of the two match ratios and the non-transposed fraction. :)
+ let $dist := xs:double((($m div $l1) + ($m div $l2) + (($m - $t) div $m)) div 3)
+ return $dist
+};
+
+(:~
+ : Returns the Jaro-Winkler similarity coefficient between two strings.
+ :
+ : This similarity coefficient corresponds to an extension of the Jaro similarity coefficient that weights or
+ : penalizes strings based on their similarity at the beginning of the string, up to a given prefix size.
+ :
+ : <br/>
+ : Example usage : <pre> jaro-winkler("DWAYNE", "DUANE", 4, 0.1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.8577777777777778 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $prefix The number of characters to consider when testing for equal prefixes in the strings.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes.
+ : @return The Jaro-Winkler similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq
+ :)
+declare function simc:jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double {
+ (: Start from the plain Jaro score and boost it by the shared-prefix length, scaled by $fact. :)
+ let $jaro := simc:jaro( $s1 , $s2 )
+ (: NOTE(review): fn:substring positions are 1-based, so substring($x, 0, $i) returns only
+    the first $i - 1 characters. Consequences: the $i = 1 iteration compares two empty
+    strings and always succeeds, so max($cc) is at least 1 even when the strings share no
+    prefix, and the effective prefix length is shifted by one versus the intent
+    (substring($x, 1, $i) would test a true $i-character prefix). The documented example
+    result above depends on this behavior — confirm before changing. :)
+ let $cc := for $i in 1 to min(($prefix, string-length($s1), string-length($s2)))
+ where substring($s1, 0, $i) = substring($s2, 0, $i) return $i
+ return ($jaro + ( $fact * max($cc) * ( 1 - $jaro ) ) )
+};
+
+(:~
+ : Returns the Needleman-Wunsch distance between two strings.
+ :
+ : The Needleman-Wunsch distance is similar to the basic edit distance metric, adding a
+ : variable cost adjustment to the cost of a gap (i.e., an insertion or deletion) in the
+ : distance metric.
+ :
+ : <br/>
+ : Example usage : <pre> needleman-wunsch("KAK", "KQRK", 1, 1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $score The score value.
+ : @param $penalty The penalty value.
+ : @return The Needleman-Wunsch distance between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq
+ :)
+declare function simc:needleman-wunsch ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
+
+ (: Base cases: aligning the rest of one string against nothing costs one gap penalty
+    per remaining character. :)
+ if(string-length($s1) = 0) then string-length($s2)* - $penalty else
+ if(string-length($s2) = 0) then string-length($s1)* - $penalty else
+ (: Global alignment: best of a gap in $s1, a gap in $s2, or aligning the two head
+    characters ($score on a match, -$penalty on a mismatch).
+    NOTE(review): naive exponential recursion, as in the sibling functions. :)
+ max((
+ simc:needleman-wunsch(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
+ simc:needleman-wunsch($s1, substring($s2, 2), $score, $penalty) - $penalty ,
+ simc:needleman-wunsch(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
+ ))
+};
+
+(:~
+ : Returns the Smith-Waterman distance between two strings.
+ :
+ : <br/>
+ : Example usage : <pre> smith-waterman("ACACACTA", "AGCACACA", 2, 1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 12 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $score The score value.
+ : @param $penalty The penalty value.
+ : @return The Smith-Waterman distance between the two strings.
+ :)
+declare function simc:smith-waterman ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
+ (: Base cases: an exhausted string contributes a local score of 0 (never negative). :)
+ if(string-length($s1) = 0) then 0 else
+ if(string-length($s2) = 0) then 0 else
+ (: Local alignment: like needleman-wunsch, but the extra 0 in max() lets the score
+    restart at any point, which is what makes the alignment "local".
+    NOTE(review): naive exponential recursion — this is the function the merge
+    proposal re-enabled after performance work elsewhere in Zorba. :)
+ max((
+ 0,
+ simc:smith-waterman(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
+ simc:smith-waterman($s1, substring($s2, 2), $score, $penalty) - $penalty ,
+ simc:smith-waterman(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
+ ))
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,579 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data consolidation functions that generally take as input a sequence of XML nodes
+ : and apply some rule in order to decide which node is better suited to represent the entire sequence.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the consolidation functions based on matching sequences against XPath expressions require
+ : some form of dynamic evaluation for XPath expressions,
+ : such as the x:eval() function provided in the Qizx XQuery Engine.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+(: Helper modules: node de-duplication (set:) and edit distance (simc:). :)
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+(: Zorba versioning option: interface version 2.0 of this module. The doubled
+   ";;" separators in the archived diff were extraction artifacts. :)
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the single most frequent node in a sequence of nodes provided as input.
+ : If more than one answer is possible, returns the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-frequent( ( "a", "a", "b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The most frequent node in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-frequent.xq
+ :)
+declare function con:most-frequent ( $s ) {
+ (: Rank each distinct node by how often it occurs in $s (deep-equal
+    comparison) and keep the top-ranked one; empty input yields (). :)
+ let $ranked :=
+   for $candidate in set:distinct($s)
+   order by count($s[deep-equal(.,$candidate)]) descending
+   return $candidate
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single less frequent node in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-frequent( ( "a", "a", "b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("b") </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The least frequent node in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/leastfrequent_1.xq
+ :)
+declare function con:least-frequent ( $s ) {
+ (: Order the distinct nodes by ascending occurrence count (deep-equal
+    comparison); the head of the ordered sequence is the answer, and an
+    empty input simply yields the empty sequence. :)
+ (for $candidate in set:distinct($s)
+  order by count($s[deep-equal(.,$candidate)])
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the single longest string, in terms of the number of characters, in a sequence of strings provided as input.
+ : If more then one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> con:longest( ( "a", "aa", "aaa") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("aaa") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The longest string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/longest_1.xq
+ :)
+declare function con:longest ( $s as xs:string* ) as xs:string? {
+ (: Sort by character count, longest first; taking the head of the sorted
+    sequence also handles empty input (yields the empty sequence). :)
+ (for $candidate in $s
+  order by string-length($candidate) descending
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the single shortest string, in terms of the number of characters, in a sequence of strings provided as input.
+ : If more then one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> shortest( ( "a", "aa", "aaa") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The shortest string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/shortest_1.xq
+ :)
+declare function con:shortest( $s as xs:string* ) as xs:string? {
+ (: Sort by character count, shortest first; taking the head of the sorted
+    sequence also handles empty input (yields the empty sequence). :)
+ (for $candidate in $s
+  order by string-length($candidate)
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the single longest string, in terms of the number of tokens, in a sequence of strings provided as input.
+ : If more then one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-tokens( ( "a b c", "a b", "a"), " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a b c") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The longest string in the input sequence, in terms of the number of tokens.
+ : @example test/Queries/data-cleaning/consolidation/most-tokens.xq
+ :)
+declare function con:most-tokens ( $s as xs:string*, $r as xs:string ) as xs:string? {
+ (: Sort by token count (descending), tokenizing on the delimiter pattern
+    $r; the head of the sorted sequence is the answer, () for empty input. :)
+ (for $candidate in $s
+  order by count(tokenize($candidate, $r)) descending
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the single shortest string, in terms of the number of tokens, in a sequence of strings provided as input.
+ : If more then one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-tokens( ( "a b c", "a b", "a"), " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The shortest string in the input sequence, in terms of the number of tokens.
+ : @example test/Queries/data-cleaning/consolidation/least-tokens.xq
+ :)
+declare function con:least-tokens ( $s as xs:string*, $r as xs:string ) as xs:string? {
+ (: Sort by token count (ascending), tokenizing on the delimiter pattern
+    $r; the head of the sorted sequence is the answer, () for empty input. :)
+ (for $candidate in $s
+  order by count(tokenize($candidate, $r))
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the strings from an input sequence of strings that match a particular regular expression.
+ :
+ : <br/>
+ : Example usage : <pre> matching( ( "a A b", "c AAA d", "e BB f"), "A+" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "a A b", "c AAA d") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r The regular expression to be used in the matching.
+ : @return The strings in the input sequence that match the input regular expression.
+ : @example test/Queries/data-cleaning/consolidation/matching_1.xq
+ :)
+declare function con:matching ( $s as xs:string*, $r as xs:string ) as xs:string* {
+ (: Filter predicate form: keep, in input order, the strings matching $r. :)
+ $s[matches(., $r)]
+};
+
+(:~
+ : Returns the single string, from an input sequence of strings, that appears more frequently as part
+ : of the other strings in the sequence. If no such string exists, the function returns an empty sequence.
+ : If more then one answer is possible, the function returns the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> super-string( ( "aaa bbb ccc", "aaa bbb", "aaa ddd", "eee fff" ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "aaa bbb" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The string that appears more frequently as part of the other strings in the sequence.
+ : @example test/Queries/data-cleaning/consolidation/superstring_1.xq
+ :)
+declare function con:superstring ( $s as xs:string* ) as xs:string? {
+ (: For each candidate, count how many strings in $s contain it. Note that
+    contains($str, $str) is true, so $cnt always includes the candidate itself;
+    the $cnt > 1 filter therefore requires at least one OTHER containing string
+    (or a duplicate of the candidate) for it to qualify. :)
+ let $aux :=
+ for $str in $s
+ let $cnt := count ( for $str2 in $s return if(contains($str2,$str)) then $str else () )
+ where $cnt > 1
+ order by $cnt descending
+ return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single most similar string, in terms of the edit distance metric towards an input string,
+ : in a sequence of strings provided as input. If more than one string has a maximum similarity (a minimum
+ : value for the edit distance metric), the function return the first string according to the order of the
+ : input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "aaabbb" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $m The string towards which we want to measure the edit distance.
+ : @return The most similar string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq
+ :)
+declare function con:most-similar-edit-distance ( $s as xs:string*, $m as xs:string ) as xs:string? {
+ (: Ascending edit distance towards $m: the head of the ordered sequence is
+    the closest string; empty input yields the empty sequence. :)
+ (for $candidate in $s
+  order by simc:edit-distance($candidate, $m)
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the single least similar string, in terms of the edit distance metric towards an input string,
+ : in a sequence of strings provided as input. If more than one string has a minimum similarity (a maximum
+ : value for the edit distance metric), return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "eeefff" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $m The string towards which we want to measure the edit distance.
+ : @return The least similar string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq
+ :)
+declare function con:least-similar-edit-distance ( $s as xs:string*, $m as xs:string ) as xs:string? {
+ (: Descending edit distance towards $m: the head of the ordered sequence is
+    the farthest string; empty input yields the empty sequence. :)
+ (for $candidate in $s
+  order by simc:edit-distance($candidate, $m) descending
+  return $candidate)[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending elements (sub-elements at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-elements( ( <a><b/></a>, <a/>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-elements.xq
+ :)
+declare function con:most-elements ( $s ) {
+ (for $str in set:distinct($s) order by count($str/descendant-or-self::element()) descending return $str)[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending attributes (attributes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a att1="a1" att2="a2"/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-attributes.xq
+ :)
+declare function con:most-attributes ( $s ) {
+ (for $str in set:distinct($s) order by count($str/descendant-or-self::*/attribute()) descending return $str)[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending nodes (sub-nodes at any given depth) in a
+ : sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-nodes( ( <a><b/></a>, <a/>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-nodes.xq
+ :)
+declare function con:most-nodes ( $s ) {
+ (for $str in set:distinct($s) order by count($str/descendant-or-self::node()) descending return $str)[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending elements (sub-elements at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-elements( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-elements.xq
+ :)
+declare function con:least-elements ( $s ) {
+ (for $str in set:distinct($s) order by count($str/descendant-or-self::element()) return $str)[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending attributes (attributes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<c/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-attributes.xq
+ :)
+declare function con:least-attributes ( $s ) {
+ (for $str in set:distinct($s) order by count($str/descendant-or-self::*/attribute()) return $str)[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending nodes (sub-nodes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-nodes.xq
+ :)
+declare function con:least-nodes ( $s ) {
+ (for $str in set:distinct($s) order by count($str/descendant-or-self::node()) return $str)[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending elements (sub-elements at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-elements( ( <a><b/><c/><d/></a>, <a><b/><b/><c/></a>, <a/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/><c/><d/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-elements.xq
+ :)
+declare function con:most-distinct-elements ( $s ) {
+ (for $str in set:distinct($s) order by count(set:distinct($str/descendant-or-self::element())) descending return $str)[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending attributes (attributes at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-attributes( ( <a att1="a1" att2="a2" att3="a3"/>, <a att1="a1" att2="a2"><b att2="a2" /></a>, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a att1="a1" att2="a2" att3="a3"/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq
+ :)
+declare function con:most-distinct-attributes ( $s ) {
+ (for $str in set:distinct($s) order by count(set:distinct($str/descendant-or-self::*/attribute())) descending return $str)[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending nodes (sub-nodes at any given depth) in
+ : a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-nodes( ( <a><b/></a>, <a><a/></a>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq
+ :)
+declare function con:most-distinct-nodes ( $s ) {
+ (for $str in set:distinct($s) order by count(set:distinct($str/descendant-or-self::node())) descending return $str)[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending elements (sub-elements at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-elements( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-elements.xq
+ :)
+declare function con:least-distinct-elements ( $s ) {
+ (for $str in set:distinct($s) order by count(set:distinct($str/descendant-or-self::element())) return $str)[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending attributes (attributes at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<c/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq
+ :)
+declare function con:least-distinct-attributes ( $s ) {
+ (for $str in set:distinct($s) order by count(set:distinct($str/descendant-or-self::*/attribute())) return $str)[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending nodes (sub-nodes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq
+ :)
+declare function con:least-distinct-nodes ( $s ) {
+ (for $str in set:distinct($s) order by count(set:distinct($str/descendant-or-self::node())) return $str)[1]
+};
+
+(:~
+ : Returns the elements from an input sequence of elements that, when matched to a given set of XPath expressions,
+ : produce a non-empty set of nodes in all the cases.
+ :
+ : <br/>
+ : Example usage : <pre> all-xpaths( ( <a><b/></a>, <c><d/></c>, <d/>), (".//b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The elements that, when matched to the given set of XPath expressions, always return a non-empty set of nodes.
+ :
+ : <br/><br/><b> Attention : This function is currently disabled; its implementation is commented out and it returns the empty string. </b> <br/>
+ :)
+declare function con:all-xpaths ( $s as element()* , $paths as xs:string* ) {
+(:
+ for $str in set:distinct($s)
+ where every $path in $paths satisfies count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0
+ return $str
+ :)
+ ""
+};
+
+(:~
+ : Returns the elements from a sequence of elements that, when matched to a given set of XPath expressions,
+ : produce a non-empty set of nodes for some of the cases.
+ :
+ : <br/>
+ : Example usage : <pre> some-xpaths( ( <a><b/></a>, <d><c/></d>, <d/>), (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <a><b/></a> , <d><c/></d> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The elements that, when matched to the given set of XPath expressions, return a non-empty set of nodes
+ : for at least one of the cases.
+ :
+ : <br/><br/><b> Attention : This function is currently disabled; its implementation is commented out and it returns the empty string. </b> <br/>
+ :)
+declare function con:some-xpaths ( $s as element()* , $paths as xs:string* ) {
+(:
+ for $str in set:distinct($s)
+ where some $path in $paths satisfies count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0
+ return $str
+ :)
+ ""
+};
+
+(:~
+ : Returns the single element from an input sequence of elements that matches the largest number of
+ : XPath expressions from a given set, producing a non-empty set of nodes.
+ : If more than one answer is possible, return the first element according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-xpaths( ( <a><b/></a>, <d><c/><b/></d>, <d/>) , (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <d><c/><b/></d> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The element that matches the largest number of XPath expressions producing a non-empty set of nodes.
+ :
+ : <br/><br/><b> Attention : This function is currently disabled; its implementation is commented out and it returns the empty string. </b> <br/>
+ :)
+
+declare function con:most-xpaths ( $s as element()* , $paths as xs:string* ) {
+ (:
+ (
+ for $str in set:distinct($s)
+ let $cnt := sum( for $path in $paths where count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0 return 1 )
+ order by $cnt descending
+ return $str
+ )[1]
+ :)
+ ""
+};
+
+(:~
+ : Returns the single element from an input sequence of elements that matches the smallest number of
+ : XPath expressions from a given set, producing a non-empty set of nodes.
+ : If more than one answer is possible, return the first element according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-xpaths( ( <a><b/></a>, <d><c/><b/></d>, <d/>) , (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( &lt;d/> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The element that matches the smallest number of XPath expressions producing a non-empty set of nodes.
+ :
+ : <br/><br/><b> Attention : This function is currently disabled; its implementation is commented out and it returns the empty string. </b> <br/>
+ :)
+
+declare function con:least-xpaths ( $s as element()* , $paths as xs:string* ) {
+(:
+ (
+ for $str in set:distinct($s)
+ let $cnt := sum( for $path in $paths where count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0 return 1 )
+ order by $cnt
+ return $str
+ )[1]
+ :)
+ ""
+};
+
+(:~
+ : Returns the nodes from an input sequence of nodes that validate against a given XML Schema.
+ :
+ : <br/>
+ : Example usage : <pre> validating-schema ( ( <a/> , <b/> ), <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"><xs:element name="a" /></xs:schema> ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <a/> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $schema An element encoding an XML Schema.
+ : @return The nodes that validate against the XML Schema.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented; it currently always returns false(). </b> <br/>
+ :
+ :)
+declare function con:validating-schema ( $s as element()*, $schema as element() ) {
+ false()
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,359 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data conversion functions for processing calendar dates,
+ : temporal values, currency values, units of measurement, location names and postal addresses.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+(: Each prolog declaration is terminated by exactly one ';' — the doubled ';;'
+   in the archived diff is a mailing-list artifact and is invalid XQuery. :)
+module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+declare namespace exref = "http://www.ecb.int/vocabulary/2002-08-01/eurofxref";
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+
+import schema namespace wp = 'http://api.whitepages.com/schema/';
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~ The key to be used when accessing the White Pages Web service :)
+declare variable $conversion:key := "06ea2f21cc15602b6a3e242e3225a81a";
+
+(:~
+ : Uses a White-pages Web service to discover information about a given name,
+ : returning a sequence of strings for the phone numbers associated to the name.
+ :
+ : <br/>
+ : Example usage : <pre> phone-from-user ('Maria Lurdes') </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (716) 686-4500 </pre>
+ :
+ : @param $name The name of person or organization.
+ : @return A sequence of strings for the phone numbers associated to the name.
+ : @example test/Queries/data-cleaning/conversion/phone-from-user.xq
+ :)
+declare %ann:nondeterministic function conversion:phone-from-user ( $name as xs:string) as xs:string*{
+ (: URL-encode blanks in the name before embedding it in the query string :)
+ let $name-value := replace($name, " ", "%20")
+ let $url := concat("http://api.whitepages.com/find_person/1.0/?name=",$name-value,";api_key=",$conversion:key)
+ (: NOTE(review): [2] presumably selects the response payload of http:get-node — confirm against the http-client module docs :)
+ let $doc := http:get-node($url)[2]
+ return
+ $doc/wp:wp/wp:listings/wp:listing/wp:phonenumbers/wp:phone/wp:fullphone/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given name,
+ : returning a sequence of strings for the addresses associated to the name.
+ :
+ : <br/>
+ : Example usage : <pre> address-from-user ('Maria Lurdes') </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 222 E 53rd St, Los Angeles, CA, US </pre>
+ : <pre> 3362 Walden Ave, Depew, NY, US </pre>
+ :
+ : @param $name The name of person or organization.
+ : @return A sequence of strings for the addresses associated to the name.
+ : @example test/Queries/data-cleaning/conversion/address-from-user.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-user ( $name as xs:string) as xs:string*{
+ (: URL-encode blanks in the name before embedding it in the query string :)
+ let $name-value := replace($name, " ", "%20")
+ let $url := concat("http://api.whitepages.com/find_person/1.0/?name=",$name-value,";api_key=",$conversion:key)
+ let $doc := http:get-node($url)[2]
+ (: one "street, city, state, country" string per listing in the response :)
+ for $a in $doc/wp:wp/wp:listings/wp:listing/wp:address
+ let $fullstreet := $a/wp:fullstreet/text()
+ let $city := $a/wp:city/text()
+ let $state := $a/wp:state/text()
+ let $country := $a/wp:country/text()
+ return concat($fullstreet, ", ", $city, ", ", $state, ", ", $country)
+};
+
+
+(:~
+ : Uses a White-pages Web service to discover information about a given phone number,
+ : returning a sequence of strings for the name associated to the phone number.
+ :
+ : <br/>
+ : Example usage : <pre> user-from-phone ('8654582358') </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> Homer Simpson </pre>
+ : <pre> Sue M Simpson </pre>
+ :
+ : @param $phone-number A string with 10 digits corresponding to the phone number.
+ : @return A sequence of strings for the person or organization's name associated to the phone number.
+ : @example test/Queries/data-cleaning/conversion/user-from-phone.xq
+ :)
+declare %ann:nondeterministic function conversion:user-from-phone ( $phone-number as xs:string) as xs:string*{
+ let $url := concat("http://api.whitepages.com/reverse_phone/1.0/?phone=",$phone-number,";api_key=",$conversion:key)
+ (: NOTE(review): [2] presumably selects the response payload of http:get-node — confirm against the http-client module docs :)
+ let $doc := http:get-node($url)[2]
+ return $doc/wp:wp/wp:listings/wp:listing/wp:displayname/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given phone number,
+ : returning a string for the address associated to the phone number.
+ :
+ : <br/>
+ : Example usage : <pre> address-from-phone ('8654582358') </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 4610 Harrison Bend Rd, Loudon, TN, US </pre>
+ :
+ : @param $phone-number A string with 10 digits corresponding to the phone number.
+ : @return A string for the addresses associated to the phone number.
+ : @example test/Queries/data-cleaning/conversion/address-from-phone.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-phone ( $phone-number as xs:string) as xs:string*{
+ let $url := concat("http://api.whitepages.com/reverse_phone/1.0/?phone=",$phone-number,";api_key=",$conversion:key)
+ let $doc := http:get-node($url)[2]
+ let $addresses :=
+ for $a in $doc/wp:wp/wp:listings/wp:listing/wp:address
+ let $fullstreet := $a/wp:fullstreet/text()
+ let $city := $a/wp:city/text()
+ let $state := $a/wp:state/text()
+ let $country := $a/wp:country/text()
+ return concat($fullstreet, ", ", $city, ", ", $state, ", ", $country)
+ (: several listings may share one address; collapse duplicates :)
+ return distinct-values($addresses)
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given address,
+ : returning a sequence of strings for the names associated to the address.
+ :
+ : <br/>
+ : Example usage : <pre> user-from-address('5655 E Gaskill Rd, Willcox, AZ, US') </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> Stan Smith </pre>
+ :
+ : @param $address A string corresponding to the address (ex: 5655 E Gaskill Rd, Willcox, AZ, US).
+ : @return A sequence of strings for the person or organization's names associated to the address.
+ : @example test/Queries/data-cleaning/conversion/user-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:user-from-address ( $address as xs:string) as xs:string*{
+ let $tokens := tokenize ($address, ",")
+ let $token-full-street := $tokens[position()=1]
+ (: the state token's position depends on whether the address has 4 or 5 comma-separated components :)
+ let $state :=
+ if (count($tokens) = 4)
+ then replace($tokens[position()=3], " ", "")
+ else
+ if (count($tokens) = 5)
+ then replace($tokens[position()=4], " ", "")
+ else()
+ (: assumes the house number is the first whitespace-delimited token of the street component — TODO confirm :)
+ let $house := tokenize($token-full-street, " ")[position()=1]
+ let $street := replace(replace($token-full-street, "[0-9]+[ ]", ""), " ", "%20")
+ let $url := concat("http://api.whitepages.com/reverse_address/1.0/?house=",$house, ";street=",$street, ";state=",$state,";api_key=",$conversion:key)
+ let $doc := http:get-node($url)[2]
+ return $doc/wp:wp/wp:listings/wp:listing/wp:displayname/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given address,
+ : returning a sequence of strings for the phone number associated to the address.
+ :
+ : <br/>
+ : Example usage : <pre> phone-from-address('5655 E Gaskill Rd, Willcox, AZ, US') </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (520) 824-3160 </pre>
+ :
+ : @param $address A string corresponding to the address (ex: 5655 E Gaskill Rd, Willcox, AZ, US).
+ : @return A sequence of strings for the phone number or organization's names associated to the address.
+ : @example test/Queries/data-cleaning/conversion/phone-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:phone-from-address ( $address as xs:string) as xs:string*{
+ let $tokens := tokenize ($address, ",")
+ let $token-full-street := $tokens[position()=1]
+ (: the state token's position depends on whether the address has 4 or 5 comma-separated components :)
+ let $state :=
+ if (count($tokens) = 4)
+ then replace($tokens[position()=3], " ", "")
+ else
+ if (count($tokens) = 5)
+ then replace($tokens[position()=4], " ", "")
+ else()
+ (: strip letters, alphanumeric suffixes and blanks, leaving only the house number :)
+ let $house := replace($token-full-street, "([A-Za-z]+|[0-9]+[A-Za-z][A-Za-z]|[ ]+)", "")
+ let $street-w-space := replace($token-full-street, $house, "")
+ (: trim the single leading or trailing blank left over after removing the house number :)
+ let $street :=
+ if (substring($street-w-space, 1, 1) = " ")
+ then substring($street-w-space, 2)
+ else
+ if(substring($street-w-space, string-length($street-w-space), 1) = " ")
+ then substring($street-w-space, 1, string-length($street-w-space)-1)
+ else ()
+ let $street-form := replace($street, " ", "%20")
+ let $url := concat("http://api.whitepages.com/reverse_address/1.0/?house=",$house, ";street=",$street-form, ";state=",$state,";api_key=",$conversion:key)
+ let $doc := http:get-node($url)[2]
+ return $doc/wp:wp/wp:listings/wp:listing/wp:phonenumbers/wp:phone/wp:fullphone/text()
+};
+
+(:~
+ : Conversion function for units of measurement, acting as a wrapper over the CuppaIT WebService.
+ : <br/>
+ : WebService documentation at http://www.cuppait.com/UnitConversionGateway-war/UnitConversion?format=XML
+ :
+ : <br/>
+ : Example usage : <pre> unit-convert ( 1 , "Distance", "mile", "kilometer" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.609344 </pre>
+ :
+ : @param $v The amount we wish to convert.
+ : @param $t The type of metric (e.g., "Distance")
+ : @param $m1 The source measurement unit metric (e.g., "meter")
+ : @param $m2 The target measurement unit metric (e.g., "mile")
+ : @return The value resulting from the conversion
+ : @error conversion:notsupported if the type of metric, the source unit or the target unit are not known to the service.
+ : @see http://www.cuppait.com/UnitConversionGateway-war/UnitConversion?format=XML
+ : @example test/Queries/data-cleaning/conversion/unit-convert.xq
+ :)
+declare %ann:nondeterministic function conversion:unit-convert ( $v as xs:double, $t as xs:string, $m1 as xs:string, $m2 as xs:string ) {
+ (: 'let' bindings are not semicolon-terminated in XQuery — the trailing ';'
+    in the archived diff was a mailing-list artifact :)
+ let $url := "http://www.cuppait.com/UnitConversionGateway-war/UnitConversion?format=XML"
+ let $ctype := concat("ctype=",$t)
+ let $cfrom := concat("cfrom=",$m1)
+ let $cto := concat("cto=",$m2)
+ let $camount := concat("camount=",$v)
+ (: '&' must be written as the predefined entity '&amp;' inside XQuery string literals :)
+ let $par := string-join(($url,$ctype,$cfrom,$cto,$camount),"&amp;")
+ let $result := data(http:get-node($par)[2])
+ (: a non-numeric payload means the service rejected the metric/unit combination :)
+ return if (matches(data($result),"-?[0-9]+(\.[0-9]+)?")) then data($result)
+ else (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/conversion', 'conversion:notsupported'), data($result)))
+};
+
+(:~
+ : Placename to geospatial coordinates converter, acting as a wrapper over the Yahoo! geocoder service.
+ :
+ : <br/>
+ : Example usage : <pre> geocode-from-address ( ("Lisboa", "Portugal") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( 38.725735 , -9.15021 ) </pre>
+ :
+ : @param $q A sequence of strings corresponding to the different components (e.g., street, city, country, etc.) of the place name.
+ : @return The pair of latitude and longitude coordinates associated with the input address.
+ : @example test/Queries/data-cleaning/conversion/geocode-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:geocode-from-address ( $q as xs:string* ) as xs:double* {
+ (: NOTE(review): $id is the Yahoo! application id and is empty here — confirm a valid appid is supplied :)
+ let $id := ""
+ (: no trailing ';' — a 'let' binding is not semicolon-terminated in XQuery :)
+ let $url := "http://where.yahooapis.com/geocode?q="
+ let $q2 := string-join(for $i in $q return translate($i," ","+"),",")
+ (: '&' must be escaped as '&amp;' inside XQuery string literals :)
+ let $call := concat($url,$q2,"&amp;appid=",$id)
+ let $doc := http:get-node($call)[2]
+ return ( xs:double($doc/ResultSet/Result/latitude/text()) , xs:double($doc/ResultSet/Result/longitude/text()) )
+};
+
+(:~
+ : Geospatial coordinates to placename converter, acting as a wrapper over the Yahoo! reverse geocoder service.
+ :
+ : <br/>
+ : Example usage : <pre> address-from-geocode ( 38.725735 , -9.15021 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( 'Portugal' , 'Lisbon' , 'praca Marques de Pombal' ) </pre>
+ :
+ : @param $lat Geospatial latitude.
+ : @param $lon Geospatial longitude.
+ : @return The sequence of strings corresponding to the different components (e.g., street, city, country, etc.) of the place name that corresponds to the input geospatial coordinates.
+ : @example test/Queries/data-cleaning/conversion/address-from-geocode.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-geocode ( $lat as xs:double, $lon as xs:double ) as xs:string* {
+ (: NOTE(review): $id is the Yahoo! application id and is empty here — confirm a valid appid is supplied :)
+ let $id := ""
+ (: no trailing ';' — a 'let' binding is not semicolon-terminated in XQuery :)
+ let $url := "http://where.yahooapis.com/geocode?q="
+ let $q := concat($lat,",+",$lon)
+ (: '&' must be escaped as '&amp;' inside XQuery string literals; gflags=R selects reverse geocoding :)
+ let $call := concat($url,$q,"&amp;gflags=R&amp;appid=",$id)
+ let $doc := http:get-node($call)[2]
+ (: keep only non-empty address components, most general (country) first :)
+ return distinct-values( (if (string-length($doc//xs:string(*:country)) > 0) then ($doc//xs:string(*:country)) else (),
+ if (string-length($doc//xs:string(*:state)) > 0) then ($doc//xs:string(*:state)) else (),
+ if (string-length($doc//xs:string(*:county)) > 0) then ($doc//xs:string(*:county)) else (),
+ if (string-length($doc//xs:string(*:city)) > 0) then ($doc//xs:string(*:city)) else (),
+ if (string-length($doc//xs:string(*:neighborhood)) > 0) then ($doc//xs:string(*:neighborhood)) else (),
+ if (string-length($doc//xs:string(*:street)) > 0) then ($doc//xs:string(*:street)) else (),
+ if (string-length($doc//xs:string(*:house)) > 0) then ($doc//xs:string(*:house)) else () ) )
+};
+
+(:~
+ : Currency conversion function, acting as a wrapper over the WebService from the European Central Bank.
+ :
+ : WebService documentation at http://www.ecb.int/stats/exchange/eurofxref/html/index.en.html
+ :
+ : <br/>
+ : Example usage : <pre> currency-convert ( 1, "USD", "EUR", "2011-01-18" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.747887218607434 </pre>
+ :
+ : @param $v The amount we wish to convert.
+ : @param $m1 The source currency (e.g., "EUR").
+ : @param $m2 The target currency (e.g., "USD").
+ : @param $date The reference date.
+ : @return The value resulting from the conversion.
+ : @error conversion:notsupported if the date, the source currency type or the target currency type are not known to the service.
+ : @see http://www.ecb.int/stats/exchange/eurofxref/html/index.en.html
+ : @example test/Queries/data-cleaning/conversion/currency-convert.xq
+ :)
+declare %ann:nondeterministic function conversion:currency-convert ( $v as xs:double, $m1 as xs:string, $m2 as xs:string, $date as xs:string ) {
+ (: 'let' bindings are not semicolon-terminated in XQuery — the trailing ';'
+    in the archived diff was a mailing-list artifact :)
+ let $daily := "http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml"
+ let $hist := "http://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist.xml"
+ (: empty date means "use today's rates"; otherwise pick the latest Cube on or before $date :)
+ let $doc := if (string-length($date) = 0) then http:get-node($daily)[2] else
+ ((for $a in http:get-node($hist)[2]//exref:Cube[
+ xs:string(@time)<=$date] order by $a/xs:string(@time) descending return $a)[1])
+ (: ECB publishes rates against EUR, so convert through EUR as the pivot currency :)
+ let $toEUR := if ( $m1="EUR" ) then (xs:double(1.0)) else ( $doc//exref:Cube[xs:string(@currency)=$m1]/xs:double(@rate) )
+ let $fromEUR := if ( $m2="EUR" ) then (xs:double(1.0)) else ( $doc//exref:Cube[xs:string(@currency)=$m2]/xs:double(@rate) )
+ let $result := ($v div $toEUR) * $fromEUR
+ return if (matches(string($result),"-?[0-9]+(\.[0-9]+)?")) then ($result)
+ else (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/conversion', 'conversion:notsupported'), data($result)))
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the phone numbers associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the phone numbers associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:phone-from-domain ( $domain as xs:string ) {
+ ()
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the addresses associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the addresses associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:address-from-domain ( $domain as xs:string ) {
+ ()
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the person or organization names associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the person or organization names associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:name-from-domain ( $domain as xs:string ) {
+ ()
+};
+
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,223 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides hybrid string similarity functions, combining the properties of
+ : character-based string similarity functions and token-based string similarity functions.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the module requires the trigonometric functions of XQuery 1.1 or a math extension
+ : function such as sqrt($x as numeric) for computing the square root.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+(: In the QizX or Saxon XQuery engines, it is possible to call external functions from the Java math library :)
+(: declare namespace math = "java:java.lang.Math"; :)
+declare namespace math = "http://www.w3.org/2005/xpath-functions/math";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Soundex phonetic similarity function is used to discover token identity, which is equivalent to saying that
+ : this function returns the cosine similarity coefficient between sets of Soundex keys.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-soundex("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of Soundex keys extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq
+ :)
+declare function simh:soft-cosine-tokens-soundex ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ (: Map every token of each input string to its Soundex key, then measure the
+    cosine similarity between the two resulting key sequences. :)
+ simt:cosine( for $tok in tokenize($s1,$r) return simp:soundex-key($tok),
+              for $tok in tokenize($s2,$r) return simp:soundex-key($tok) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Metaphone phonetic similarity function is used to discover token identity, which is equivalent to saying that
+ : this function returns the cosine similarity coefficient between sets of Metaphone keys.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-metaphone("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of Metaphone keys extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq
+ :)
+declare function simh:soft-cosine-tokens-metaphone ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ (: Convert each token of both inputs to its Metaphone key and compare the two
+    key sequences with the token-based cosine measure. :)
+ let $metaphones1 := for $tok in tokenize($s1,$r) return simp:metaphone-key($tok)
+ let $metaphones2 := for $tok in tokenize($s2,$r) return simp:metaphone-key($tok)
+ return simt:cosine($metaphones1, $metaphones2)
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Edit Distance similarity function is used to discover token identity, and tokens having an edit distance
+ : below a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-edit-distance("The FLWOR Foundation", "FLWOR Found.", " +", 0 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.408248290463863 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ :)
+declare function simh:soft-cosine-tokens-edit-distance ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:integer ) as xs:double {
+(:
+ let $tokens1 := tokenize($s1,$r)
+ let $tokens2 := tokenize($s2,$r)
+ let $tokens := ($tokens1, $tokens2)
+ let $vocab := for $a at $apos in $tokens
+ where every $ba in subsequence($tokens, 1, $apos - 1) satisfies not(simc:edit-distance($ba,$a) <= $t)
+ return $a
+ let $freq1 := for $a1 in $vocab return count($tokens1[simc:edit-distance(.,$a1) <= $t])
+ let $freq2 := for $a2 in $vocab return count($tokens2[simc:edit-distance(.,$a2) <= $t])
+ let $freq1pow := for $aa1 in $freq1 return $aa1 * $aa1
+ let $freq2pow := for $aa2 in $freq2 return $aa2 * $aa2
+ let $mult := for $freq at $pos in $freq1 return $freq * $freq2[$pos]
+ return sum($mult) div (math:sqrt(sum($freq1pow)) * math:sqrt(sum($freq2pow)))
+ :)
+ (: NOTE(review): the real implementation above is disabled (commented out);
+    the function currently ignores all arguments and always returns 0. :)
+ xs:double(0)
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Jaro similarity function is used to discover token identity, and tokens having a Jaro similarity above
+ : a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-jaro("The FLWOR Foundation", "FLWOR Found.", " +", 1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq
+ :)
+declare function simh:soft-cosine-tokens-jaro ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:double ) as xs:double {
+ (: Tokenize both strings and pool all tokens together. :)
+ let $toks1 := tokenize($s1,$r)
+ let $toks2 := tokenize($s2,$r)
+ let $pool := ($toks1, $toks2)
+ (: Vocabulary: keep a pooled token only when no earlier pooled token already
+    matches it under the Jaro threshold $t (approximate de-duplication). :)
+ let $vocab := for $cand at $i in $pool
+ where every $prev in subsequence($pool, 1, $i - 1) satisfies not(simc:jaro($prev,$cand) >= $t)
+ return $cand
+ (: Term-frequency vectors: matches of each vocabulary entry in each string. :)
+ let $tf1 := for $v in $vocab return count($toks1[simc:jaro(.,$v) >= $t])
+ let $tf2 := for $v in $vocab return count($toks2[simc:jaro(.,$v) >= $t])
+ (: Cosine: dot product divided by the product of the Euclidean norms. :)
+ let $dot := sum(for $c at $i in $tf1 return $c * $tf2[$i])
+ return $dot div (math:sqrt(sum(for $c in $tf1 return $c * $c)) * math:sqrt(sum(for $c in $tf2 return $c * $c)))
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Jaro-Winkler similarity function is used to discover token identity, and tokens having a Jaro-Winkler
+ : similarity above a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-jaro-winkler("The FLWOR Foundation", "FLWOR Found.", " +", 1, 4, 0.1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.45 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @param $prefix The number of characters to consider when testing for equal prefixes with the Jaro-Winkler metric.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes with the Jaro-Winkler metric.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq
+ :)
+declare function simh:soft-cosine-tokens-jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:double, $prefix as xs:integer?, $fact as xs:double? ) as xs:double {
+ (: Split both strings into tokens and pool them together. :)
+ let $tokens1 := tokenize($s1,$r)
+ let $tokens2 := tokenize($s2,$r)
+ let $tokens := ($tokens1, $tokens2)
+ (: Vocabulary: a pooled token is kept only if no earlier pooled token already
+    matches it under the Jaro-Winkler threshold $t (approximate de-duplication). :)
+ let $vocab := for $a at $apos in $tokens
+ where every $ba in subsequence($tokens, 1, $apos - 1) satisfies not(simc:jaro-winkler($ba,$a,$prefix,$fact) >= $t)
+ return $a
+ (: Term-frequency vectors over the vocabulary, one per input string. :)
+ let $freq1 := for $a1 in $vocab return count($tokens1[simc:jaro-winkler(.,$a1,$prefix,$fact) >= $t])
+ let $freq2 := for $a2 in $vocab return count($tokens2[simc:jaro-winkler(.,$a2,$prefix,$fact) >= $t])
+ let $freq1pow := for $aa1 in $freq1 return $aa1 * $aa1
+ let $freq2pow := for $aa2 in $freq2 return $aa2 * $aa2
+ (: Dot product of the two frequency vectors. :)
+ let $mult := for $freq at $pos in $freq1 return $freq * $freq2[$pos]
+ (: Cosine: dot product divided by the product of the Euclidean norms. :)
+ return sum($mult) div (math:sqrt(sum($freq1pow)) * math:sqrt(sum($freq2pow)))
+};
+
+(:~
+ : Returns the Monge-Elkan similarity coefficient between two strings, using the Jaro-Winkler
+ : similarity function to discover token identity.
+ :
+ : <br/>
+ : Example usage : <pre> monge-elkan-jaro-winkler("Comput. Sci. and Eng. Dept., University of California, San Diego", "Department of Computer Scinece, Univ. Calif., San Diego", 4, 0.1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.992 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $prefix The number of characters to consider when testing for equal prefixes with the Jaro-Winkler metric.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes with the Jaro-Winkler metric.
+ : @return The Monge-Elkan similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq
+ :)
+declare function simh:monge-elkan-jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double{
+ let $s1tokens := tokenize($s1, " ")
+ (: BUG FIX: the second token list was previously built from $s1, so the
+    function always compared the first string against itself. :)
+ let $s2tokens := tokenize($s2, " ")
+ let $length := min((count($s1tokens), count($s2tokens)))
+ (: For each token of $s1, take its best Jaro-Winkler match among the tokens of $s2,
+    then average the best-match scores (Monge-Elkan aggregation). :)
+ let $res := for $s1n in $s1tokens
+ return max(for $s2n in $s2tokens return simc:jaro-winkler($s1n,$s2n,$prefix,$fact))
+ return (1 div $length) * sum($res)
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1382 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data normalization functions for processing calendar dates,
+ : temporal values, currency values, units of measurement, location names and postal addresses.
+ :
+ : These functions are particularly useful for converting different data representations into canonical formats.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Converts a given string representation of a date value into a date representation valid according
+ : to the corresponding XML Schema type.
+ :
+ : <br/>
+ : Example usage : <pre> to-date ( "24OCT2002" , "%d%b%Y" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 2002-10-24 </pre>
+ :
+ : @param $sd The string representation for the date
+ : @param $format An optional parameter denoting the format used to represent the date in the string, according to a
+ : sequence of conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed
+ : by a single letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion
+ : specification is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ : <pre>
+ : '%a' Abbreviated weekday name in the current locale.<br/>
+ : '%A' Full weekday name in the current locale.<br/>
+ : '%b' Abbreviated month name in the current locale.<br/>
+ : '%B' Full month name in the current locale.<br/>
+ : '%d' Day of the month as decimal number (01-31).<br/>
+ : '%j' Day of year as decimal number (001-366).<br/>
+ : '%m' Month as decimal number (01-12).<br/>
+ : '%U' Week of the year as decimal number (00-53) using Sunday as the first day of the week (and typically with the first Sunday of the year as day 1 of week 1). This is the US convention.<br/>
+ : '%w' Weekday as decimal number (0-6, Sunday is 0).<br/>
+ : '%W' Week of the year as decimal number (00-53) using Monday as the first day of the week (and typically with the first Monday of the year as day 1 of week 1). This is the UK convention.<br/>
+ : '%x' Date, locale-specific.<br/>
+ : '%y' Year without century (00-99).<br/>
+ : '%Y' Year with century.<br/>
+ : '%C' Century (00-99): the integer part of the year divided by 100.<br/>
+ : '%D' Locale-specific date format such as '%m/%d/%y'.<br/>
+ : '%e' Day of the month as decimal number (1-31), with a leading space for a single-digit number.<br/>
+ : '%F' Equivalent to %Y-%m-%d (the ISO 8601 date format).<br/>
+ : '%g' The last two digits of the week-based year (see '%V').<br/>
+ : '%G' The week-based year (see '%V') as a decimal number.<br/>
+ : '%h' Equivalent to '%b'.<br/>
+ : '%u' Weekday as a decimal number (1-7, Monday is 1).<br/>
+ : '%V' Week of the year as decimal number (00-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1.
+ :</pre>
+ :
+ : @return The date value resulting from the conversion.
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :)
+declare function normalization:to-date ( $sd as xs:string, $format as xs:string? ) as xs:string{
+(:
+ let $dictionary := normalization:month-dictionary()
+ let $format-tokens := tokenize($format, "%")[position()>1]
+ let $sd-tokens :=
+ if (contains($sd, "-") or contains($sd, "/") or contains($sd, " "))
+ then tokenize ($sd, "[ \-/]")
+ else let $ydtoken := tokenize(replace($sd, "[A-Za-z]", " "), " ")
+ let $ft := $ydtoken[position()=1]
+ let $lt := $ydtoken[last()]
+ let $mtoken := replace($sd, "[0-9]", "") return ($ft, $mtoken, $lt)
+ return
+ if (count($sd-tokens)>1)
+ then
+ let $year :=
+ if (count(index-of($format-tokens, "F")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then concat("19", string($sd-tokens[position() = 3]))
+ else
+
+ if (count(index-of($format-tokens, "Y")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "Y")]) else
+
+ if (count(index-of($format-tokens, "y")) != 0)
+ then
+ if(count(index-of($format-tokens, "C")) !=0)
+ then concat(string(number(string($sd-tokens[position() = index-of($format-tokens, "C")]))-1), string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else
+ concat("19", string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else "YND"
+
+ let $month :=
+ if (count(index-of($format-tokens, "h")) != 0)
+ then string($dictionary//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "h")]]/@value) else
+
+ if (count(index-of($format-tokens, "b")) != 0)
+ then string($dictionary//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "b")]]/@value)
+ else
+
+ if (count(index-of($format-tokens, "B")) != 0)
+ then string($dictionary//month[lower-case(@name) =
+ lower-case($sd-tokens[position() = index-of($format-tokens, "B")])]/@value)
+
+ else
+
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "m")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "m")])
+
+ else "MND"
+
+ let $day :=
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "d")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "d")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "DND"
+
+ let $result := concat($year, "-", $month, "-", $day)
+
+ return
+
+ if (matches(string($result),"[0-9]+-((0[1-9])|(1[0-2]))-((0[1-9])|([12][0-9])|(3[01]))"))
+ then $result
+ else
+ (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/normalization',
+ 'err:notsupported'),data(concat($result, " - ", concat("year: ", $year), concat(" month: ", $month), concat(" day:", $day)))))
+ else()
+ :)
+ (: NOTE(review): the implementation above is disabled (commented out); the
+    function currently ignores both arguments and always returns "". :)
+ ""
+};
+
+(:~
+ : Converts a given string representation of a time value into a time representation valid according to
+ : the corresponding XML Schema type.
+ :
+ : <br/>
+ : Example usage : <pre> to-time ( "09 hours 10 minutes" , "%H hours %M minutes" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 09:10:00 </pre>
+ :
+ : @param $sd The string representation for the time.
+ : @param $format An optional parameter denoting the format used to represent the time in the string, according to a sequence of
+ : conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed by a single
+ : letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion specification
+ : is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ :
+ : <pre>
+ : '%H' Hours as decimal number (00-23).<br/>
+ : '%I' Hours as decimal number (01-12).<br/>
+ : '%M' Minute as decimal number (00-59).<br/>
+ : '%p' AM/PM indicator in the locale. Used in conjunction with '%I' and *not* with '%H'.<br/>
+ : '%S' Second as decimal number (00-61), allowing for up to two leap-seconds.<br/>
+ : '%X' Time, locale-specific.<br/>
+ : '%z' Offset from Greenwich, so '-0900' is 9 hours west of Greenwich.<br/>
+ : '%Z' Time zone as a character string.<br/>
+ : '%k' The 24-hour clock time with single digits preceded by a blank.<br/>
+ : '%l' The 12-hour clock time with single digits preceded by a blank.<br/>
+ : '%r' The 12-hour clock time (using the locale's AM or PM).<br/>
+ : '%R' Equivalent to '%H:%M'.<br/>
+ : '%T' Equivalent to '%H:%M:%S'.<br/>
+ :</pre>
+ :
+ : @return The time value resulting from the conversion.
+ :)
+declare function normalization:to-time ( $sd as xs:string, $format as xs:string? ) as xs:string{
+ let $timezoneDict := normalization:timeZone-dictionary()
+ let $format-tokens := tokenize($format, "%")[position()>1]
+ let $sd-tokens :=
+ if (contains($sd, ":") or contains($sd, ".") or contains($sd, " "))
+ then tokenize ($sd, "[ :\.]")
+ else ()
+ return
+ if (count($sd-tokens)>1)
+ then
+ let $hours :=
+ if (count(index-of($format-tokens, "T")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),1,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "H")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "H")]) else
+
+ if (count(index-of($format-tokens, "k")) != 0)
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "k")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "k")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "k")])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then
+ if(lower-case(string($sd-tokens[position() = 4]))="am")
+ then string($sd-tokens[position() = 1])
+ else if(lower-case(string($sd-tokens[position() = 4]))="pm")
+ then if(string($sd-tokens[position() = 1])="12") then 12
+ else string(number(string($sd-tokens[position() = 1]))+12)
+ else()
+
+ else
+
+ if (count(index-of($format-tokens, "I")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then string($sd-tokens[position() = index-of($format-tokens, "I")])
+ else if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "I")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "I")]))+12)
+ else()
+ else()
+
+ else
+ if (count(index-of($format-tokens, "l")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "l")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "l")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "l")])
+
+ else if (lower-case(string($sd-tokens[position() = index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "l")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "l")]))+12)
+ else()
+
+ else ()
+
+
+ else "HND"
+
+ let $minutes :=
+
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),4,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "M")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "M")])
+
+ else "MND"
+
+ let $seconds :=
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),7,2)
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 3])
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then "00"
+ else
+
+ if (count(index-of($format-tokens, "S")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "S")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "SND"
+
+ let $result :=
+
+ if (count(index-of($format-tokens, "Z")) != 0)
+ then
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() = index-of($format-tokens, "Z")]]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) > 59)
+ then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) < 0)
+ then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + -(number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2)))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2)))) mod 60))
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+
+ (:z:)
+ if (count(index-of($format-tokens, "z")) != 0)
+ then if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) > 59) then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) < 0) then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+ concat($hours, ":", $minutes, ":", $seconds)
+
+ return
+
+ if (matches(string($result),"(([01][0-9])|(2[0-3])):[0-5][0-9]:[0-5][0-9]"))
+ then $result
+ else
+ (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/normalization',
+ 'err:notsupported'),data(concat($result, " - ", concat("hours: ", $hours), concat(" minutes: ", $minutes), concat(" seconds:", $seconds)))))
+ else()
+
+};
+
+(:~
+ : Converts a given string representation of a dateTime value into a dateTime representation
+ : valid according to the corresponding XML Schema type.
+ :
+ : <br/>
+ : Example usage : <pre> to-dateTime( "24OCT2002 21:22" , "%d%b%Y %H%M" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 2002-10-24T21:22:00 </pre>
+ :
+ : @param $sd The string representation for the dateTime.
+ : @param $format An optional parameter denoting the format used to represent the dateTime in the string, according to a sequence
+ : of conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed by a single
+ : letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion specification
+ : is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ :
+ : <pre>
+ : '%a' Abbreviated weekday name in the current locale.<br/>
+ : '%A' Full weekday name in the current locale.<br/>
+ : '%b' Abbreviated month name in the current locale.<br/>
+ : '%B' Full month name in the current locale.<br/>
+ : '%c' Date and time, locale-specific.<br/>
+ : '%d' Day of the month as decimal number (01-31).<br/>
+ : '%H' Hours as decimal number (00-23).<br/>
+ : '%I' Hours as decimal number (01-12).<br/>
+ : '%j' Day of year as decimal number (001-366).<br/>
+ : '%m' Month as decimal number (01-12).<br/>
+ : '%M' Minute as decimal number (00-59).<br/>
+ : '%p' AM/PM indicator in the locale. Used in conjunction with '%I' and *not* with '%H'.<br/>
+ : '%S' Second as decimal number (00-61), allowing for up to two leap-seconds.<br/>
+ : '%U' Week of the year as decimal number (00-53) using Sunday as the first day of the week (and typically with the first Sunday of the year as day 1 of week 1). This is the US convention.<br/>
+ : '%w' Weekday as decimal number (0-6, Sunday is 0).<br/>
+ : '%W' Week of the year as decimal number (00-53) using Monday as the first day of week (and typically with the first Monday of the year as day 1 of week 1). This is the UK convention.<br/>
+ : '%x' Date, locale-specific.<br/>
+ : '%X' Time, locale-specific.<br/>
+ : '%y' Year without century (00-99).<br/>
+ : '%Y' Year with century.<br/>
+ : '%z' Offset from Greenwich, so '-0900' is 9 hours west of Greenwich.<br/>
+ : '%Z' Time zone as a character string.<br/>
+ : '%C' Century (00-99): the integer part of the year divided by 100.<br/>
+ : '%D' Locale-specific date format such as '%m/%d/%y': ISO C99 says it should be that exact format.<br/>
+ : '%e' Day of the month as decimal number (1-31), with a leading space for a single-digit number.<br/>
+ : '%F' Equivalent to %Y-%m-%d (the ISO 8601 date format).<br/>
+ : '%g' The last two digits of the week-based year (see '%V').<br/>
+ : '%G' The week-based year (see '%V') as a decimal number.<br/>
+ : '%h' Equivalent to '%b'.<br/>
+ : '%k' The 24-hour clock time with single digits preceded by a blank.<br/>
+ : '%l' The 12-hour clock time with single digits preceded by a blank.<br/>
+ : '%r' The 12-hour clock time (using the locale's AM or PM).<br/>
+ : '%R' Equivalent to '%H:%M'.<br/>
+ : '%T' Equivalent to '%H:%M:%S'.<br/>
+ : '%u' Weekday as a decimal number (1-7, Monday is 1).<br/>
+ : '%V' Week of the year as decimal number (00-53) as defined in ISO 8601. If the week (starting on Monday) containing 1 January has four or more days in the new year, then it is considered week 1. Otherwise, it is the last week of the previous year, and the next week is week 1.
+ :</pre>
+ :
+ : @return The dateTime value resulting from the conversion.
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function normalization:to-dateTime ( $sd as xs:string, $format as xs:string? ) as xs:string {
+(:
+ let $timezoneDict := normalization:timeZone-dictionary()
+ let $monthDict := normalization:month-dictionary()
+ let $format-tokens := tokenize($format, "[ \-%]+")[position()>1]
+ let $sdt :=
+ if (contains($sd, ":") or contains($sd, ".") or contains($sd, " ") or contains($sd, "-")
+ or contains($sd, "/"))
+ then tokenize ($sd, "[ \-/:\.]+")
+ else ()
+ let $sdtok :=
+ if ((count(index-of($format-tokens, "z")) != 0) and (not(contains($sdt[last()], "+"))))
+ then ($sdt[position() != last()], concat("-", $sdt[position() = last()]))
+ else $sdt
+ let $sd-tokens :=
+ for $a in $sdtok
+ return
+ if (matches($a, "[0-9][0-9][A-Za-z]+[0-9][0-9]+"))
+ then (let $ydtoken := tokenize(replace($a, "[A-Za-z]", " "), " ")
+ let $ft := $ydtoken[position()=1]
+ let $lt := $ydtoken[last()]
+ let $mtoken := replace($a, "[0-9]", "") return ($ft, $mtoken, $lt))
+ else $a
+ let $timeFormat := tokenize($format, "[ :\.\-]")[position()>1]
+ let $dateFormat := tokenize($format, "[ :\.\-]")[position()=1]
+ return
+ if (count($sd-tokens)>1)
+ then
+ (:Date:)
+ let $year :=
+ if (count(index-of($format-tokens, "F")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then concat("19", string($sd-tokens[position() = 3]))
+ else
+
+ if (count(index-of($format-tokens, "Y")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "Y")]) else
+
+ if (count(index-of($format-tokens, "y")) != 0)
+ then
+ if(count(index-of($format-tokens, "C")) !=0)
+ then concat(string(number(string($sd-tokens[position() = index-of($format-tokens, "C")]))-1), string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else
+ concat("19", string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else "YND"
+
+ let $month :=
+ if (count(index-of($format-tokens, "h")) != 0)
+ then string($monthDict//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "h")]]/@value) else
+
+ if (count(index-of($format-tokens, "b")) != 0)
+ then string($monthDict//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "b")]]/@value)
+ else
+
+ if (count(index-of($format-tokens, "B")) != 0)
+ then string($monthDict//month[lower-case(@name) =
+ lower-case($sd-tokens[position() = index-of($format-tokens, "B")])]/@value)
+
+ else
+
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "m")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "m")])
+
+ else "MND"
+
+ let $day :=
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "d")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "d")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "DND"
+
+ (:Time:)
+ let $hours :=
+ if (count(index-of($format-tokens, "T")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),1,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "H")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "H")]) else
+
+ if (count(index-of($format-tokens, "k")) != 0)
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "k")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "k")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "k")])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then
+ if(lower-case(string($sd-tokens[position() = 4]))="am")
+ then string($sd-tokens[position() = 1])
+ else if(lower-case(string($sd-tokens[position() = 4]))="pm")
+ then if(string($sd-tokens[position() = 1])="12") then 12
+ else string(number(string($sd-tokens[position() = 1]))+12)
+ else()
+
+ else
+
+ if (count(index-of($format-tokens, "I")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then string($sd-tokens[position() = index-of($format-tokens, "I")])
+ else if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "I")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "I")]))+12)
+ else()
+ else()
+
+ else
+ if (count(index-of($format-tokens, "l")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "l")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "l")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "l")])
+
+ else if (lower-case(string($sd-tokens[position() = index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "l")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "l")]))+12)
+ else()
+
+ else ()
+
+
+ else "HND"
+
+ let $minutes :=
+
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),4,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "M")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "M")])
+
+ else "MND"
+
+ let $seconds :=
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),7,2)
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 3])
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then "00"
+ else
+
+ if (count(index-of($format-tokens, "S")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "S")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "SND"
+
+ let $result :=
+
+ if (count(index-of($format-tokens, "Z")) != 0)
+ then
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() = index-of($format-tokens, "Z")]]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) > 59)
+ then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+
+
+ return concat($year, "-", $month, "-", $day, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) < 0)
+ then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+
+ return concat($year, "-", $month, "-", $day, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+
+
+ if (count(index-of($format-tokens, "z")) != 0)
+ then if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) > 59) then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+
+
+ return concat($year, "-", $month, "-", $day, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) < 0) then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+
+ return concat($year, "-", $month, "-", $day, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+ concat($year, "-", $month, "-", $day, "T", $hours, ":", $minutes, ":", $seconds)
+
+ return
+
+ if (matches(string($result),"[0-9]+-((0[1-9])|(1[0-2]))-((0[1-9])|([12][0-9])|(3[01]))T(([01][0-9])|(2[0-3])):[0-5][0-9]:[0-5][0-9]"))
+ then $result
+ else
+ (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/normalization',
+ 'err:notsupported'),data(concat($result, " - ", concat("hours: ", $hours), concat(" minutes: ", $minutes), concat(" seconds:", $seconds)))))
+
+ else()
+:)""
+};
+
+(:~
+ : Uses an address normalization Web service to convert a postal address given as input into a
+ : canonical representation format.
+ :
+ : <br/>
+ : Example usage : <pre> normalize-address ( ( 'Marques de Pombal' , 'Lisboa' ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( 'Portugal' , 'Lisbon' , 'praca Marques de Pombal' ) </pre>
+ :
+ : @param $addr A sequence of strings encoding an address, where each string in the sequence corresponds to a different component (e.g., street, city, country, etc.) of the address.
+ : @return A sequence of strings with the address encoded in a canonical format, where each string in the sequence corresponds to a different component (e.g., street, city, country, etc.) of the address.
+ : @example test/Queries/data-cleaning/normalization/normalize-address.xq
+ :)
+declare %ann:nondeterministic function normalization:normalize-address ( $addr as xs:string* ) as xs:string* {
+
+ let $id := ""
+ let $url := "http://where.yahooapis.com/geocode?q=";
+ let $q2 := string-join(for $i in $addr return translate($i," ","+"),",")
+ let $call := concat($url,$q2,"&appid=",$id)
+ let $doc := http:get-node($call)[2]
+ return distinct-values( (if (string-length($doc//xs:string(*:country)) > 0) then ($doc//xs:string(*:country)) else (),
+ if (string-length($doc//xs:string(*:state)) > 0) then ($doc//xs:string(*:state)) else (),
+ if (string-length($doc//xs:string(*:county)) > 0) then ($doc//xs:string(*:county)) else (),
+ if (string-length($doc//xs:string(*:city)) > 0) then ($doc//xs:string(*:city)) else (),
+ if (string-length($doc//xs:string(*:neighborhood)) > 0) then ($doc//xs:string(*:neighborhood)) else (),
+ if (string-length($doc//xs:string(*:street)) > 0) then ($doc//xs:string(*:street)) else (),
+ if (string-length($doc//xs:string(*:house)) > 0) then ($doc//xs:string(*:house)) else () ) )
+};
+
+(:~
+ : Uses a phone number normalization Web service to convert a phone number given as input into a
+ : canonical representation.
+ :
+ : @param $addr A string encoding a phone number.
+ : @return A string with the phone number encoded in a canonical format.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function normalization:normalize-phone ( $addr as xs:string* ) as xs:string* {
+ ()
+};
+
+(:~
+ : Internal auxiliary function that returns an XML representation for a dictionary that contains the
+ : time-shift value associated to different time-zone abbreviations.
+ :)
+declare %private function normalization:timeZone-dictionary() as element(){
+ let $result :=
+ <dictionary>
+ <timeZone name="A" value="+0100"/>
+ <timeZone name="ADT" value="-0300"/>
+ <timeZone name="AFT" value="+0430"/>
+ <timeZone name="AKDT" value="-0800"/>
+ <timeZone name="AKST" value="-0900"/>
+ <timeZone name="ALMT" value="+0600"/>
+ <timeZone name="AMST" value="+0500"/>
+ <!--<timeZone name="AMST" value="-0300"/>-->
+ <timeZone name="AMT" value="+0400"/>
+ <!--<timeZone name="AMT" value="-0400"/>-->
+ <timeZone name="ANAST" value="+1200"/>
+ <timeZone name="ANAT" value="+1200"/>
+ <timeZone name="AQTT" value="+0500"/>
+ <timeZone name="ART" value="-0300"/>
+ <timeZone name="AST" value="-0400"/>
+ <timeZone name="AZOST" value="+0000"/>
+ <timeZone name="AZOT" value="-0100"/>
+ <timeZone name="AZST" value="+0500"/>
+ <timeZone name="AZT" value="+0400"/>
+ <timeZone name="B" value="+0200"/>
+ <timeZone name="BNT" value="+0800"/>
+ <timeZone name="BOT" value="-0400"/>
+ <timeZone name="BRST" value="-0200"/>
+ <timeZone name="BRT" value="-0300"/>
+ <!--<timeZone name="BST" value="+0600"/>-->
+ <timeZone name="BST" value="+0100"/>
+ <timeZone name="BTT" value="+0600"/>
+ <timeZone name="C" value="+0300"/>
+ <timeZone name="CAST" value="+0800"/>
+ <timeZone name="CAT" value="+0200"/>
+ <timeZone name="CCT" value="+0630"/>
+ <!--<timeZone name="CDT" value="+1030"/>-->
+ <!--<timeZone name="CDT" value="-0400"/>-->
+ <timeZone name="CDT" value="-0500"/>
+ <timeZone name="CEST" value="+0200"/>
+ <timeZone name="CET" value="+0100"/>
+ <timeZone name="CHADT" value="+1345"/>
+ <timeZone name="CHAST" value="+1245"/>
+ <timeZone name="CKT" value="-1000"/>
+ <timeZone name="CLST" value="-0300"/>
+ <timeZone name="CLT" value="-0400"/>
+ <timeZone name="COT" value="-0500"/>
+ <!--<timeZone name="CST" value="+0800"/>-->
+ <!--<timeZone name="CST" value="+0930"/>-->
+ <!--<timeZone name="CST" value="-0600"/>-->
+ <!--<timeZone name="CST" value="-0500"/>-->
+ <timeZone name="CST" value="-0600"/>
+ <timeZone name="CVT" value="-0100"/>
+ <timeZone name="CXT" value="+0700"/>
+ <timeZone name="ChST" value="+1000"/>
+ <timeZone name="D" value="+0400"/>
+ <timeZone name="DAVT" value="+0700"/>
+ <timeZone name="E" value=""/>
+ <timeZone name="EASST" value="-0500"/>
+ <timeZone name="EAST" value="-0600"/>
+ <timeZone name="EAT" value="+0300"/>
+ <timeZone name="ECT" value="-0500"/>
+ <!--<timeZone name="EDT" value="+1100"/>-->
+ <timeZone name="EDT" value="-0400"/>
+ <timeZone name="EEST" value="+0300"/>
+ <timeZone name="EET" value="+0200"/>
+ <timeZone name="EGST" value="+0000"/>
+ <timeZone name="EGT" value="-0100"/>
+ <timeZone name="EST" value="+1000"/>
+ <!--<timeZone name="EST" value="-0500"/>-->
+ <timeZone name="ET" value="-0500"/>
+ <timeZone name="F" value="+0600"/>
+ <timeZone name="FJST" value="+1300"/>
+ <timeZone name="FJT" value="+1200"/>
+ <timeZone name="FKST" value="-0300"/>
+ <timeZone name="FKT" value="-0400"/>
+ <timeZone name="FNT" value="-0200"/>
+ <timeZone name="G" value="+0700"/>
+ <timeZone name="GALT" value="-0600"/>
+ <timeZone name="GAMT" value="-0900"/>
+ <timeZone name="GET" value="+0400"/>
+ <timeZone name="GFT" value="-0300"/>
+ <timeZone name="GILT" value="+1200"/>
+ <timeZone name="GMT" value="+0000"/>
+ <timeZone name="GST" value="+0400"/>
+ <timeZone name="GYT" value="-0400"/>
+ <timeZone name="H" value="+0800"/>
+ <timeZone name="HAA" value="-0300"/>
+ <timeZone name="HAC" value="-0500"/>
+ <timeZone name="HADT" value="-0900"/>
+ <timeZone name="HAE" value="-0400"/>
+ <timeZone name="HAP" value="-0700"/>
+ <timeZone name="HAR" value="-0600"/>
+ <timeZone name="HAST" value="-1000"/>
+ <timeZone name="HAT" value="-0230"/>
+ <timeZone name="HAY" value="-0800"/>
+ <timeZone name="HKT" value="+0800"/>
+ <timeZone name="HLV" value="-0430"/>
+ <timeZone name="HNA" value="-0400"/>
+ <timeZone name="HNC" value="-0600"/>
+ <timeZone name="HNE" value="-0500"/>
+ <timeZone name="HNP" value="-0800"/>
+ <timeZone name="HNR" value="-0700"/>
+ <timeZone name="HNT" value="-0330"/>
+ <timeZone name="I" value="+0900"/>
+ <timeZone name="ICT" value="+0700"/>
+ <timeZone name="IDT" value="+0300"/>
+ <timeZone name="IOT" value="+0600"/>
+ <timeZone name="IRDT" value="+0430"/>
+ <timeZone name="IRKST" value="+0900"/>
+ <timeZone name="IRKT" value="+0800"/>
+ <timeZone name="IRST" value="+0330"/>
+ <!--<timeZone name="IST" value="+0200"/>-->
+ <timeZone name="IST" value="+0530"/>
+ <!--<timeZone name="IST" value="+0100"/>-->
+ <timeZone name="JST" value="+0900"/>
+ <timeZone name="K" value="+1000"/>
+ <timeZone name="KGT" value="+0600"/>
+ <timeZone name="KRAST" value="+0800"/>
+ <timeZone name="KRAT" value="+0700"/>
+ <timeZone name="KST" value="+0900"/>
+ <timeZone name="KUYT" value="+0400"/>
+ <timeZone name="L" value="+1100"/>
+ <timeZone name="LHDT" value="+1100"/>
+ <timeZone name="LHST" value="+10:30"/>
+ <timeZone name="LINT" value="+1400"/>
+ <timeZone name="M" value="+1200"/>
+ <timeZone name="MAGST" value="+1200"/>
+ <timeZone name="MAGT" value="+1100"/>
+ <timeZone name="MART" value="-0930"/>
+ <timeZone name="MAWT" value="+0500"/>
+ <timeZone name="MDT" value="-0600"/>
+ <timeZone name="MHT" value="+1200"/>
+ <timeZone name="MMT" value="+0630"/>
+ <timeZone name="MSD" value="+0400"/>
+ <timeZone name="MSK" value="+0300"/>
+ <timeZone name="MST" value="-0700"/>
+ <timeZone name="MUT" value="+0400"/>
+ <timeZone name="MVT" value="+0500"/>
+ <timeZone name="MYT" value="+0800"/>
+ <timeZone name="N" value="-0100"/>
+ <timeZone name="NCT" value="+1100"/>
+ <timeZone name="NDT" value="-0230"/>
+ <timeZone name="NFT" value="+1130"/>
+ <timeZone name="NOVST" value="+0700"/>
+ <timeZone name="NOVT" value="+0600"/>
+ <timeZone name="NPT" value="+0545"/>
+ <timeZone name="NST" value="-0330"/>
+ <timeZone name="NUT" value="-1100"/>
+ <timeZone name="NZDT" value="+1300"/>
+ <timeZone name="NZST" value="+1200"/>
+ <timeZone name="O" value="-0200"/>
+ <timeZone name="OMSST" value="+0700"/>
+ <timeZone name="OMST" value="+0600"/>
+ <timeZone name="P" value="-0300"/>
+ <timeZone name="PDT" value="-0700"/>
+ <timeZone name="PET" value="-0500"/>
+ <timeZone name="PETST" value="+1200"/>
+ <timeZone name="PETT" value="+1200"/>
+ <timeZone name="PGT" value="+1000"/>
+ <timeZone name="PHOT" value="+1300"/>
+ <timeZone name="PHT" value="+0800"/>
+ <timeZone name="PKT" value="+0500"/>
+ <timeZone name="PMDT" value="-0200"/>
+ <timeZone name="PMST" value="-0300"/>
+ <timeZone name="PONT" value="+1100"/>
+ <timeZone name="PST" value="-0800"/>
+ <timeZone name="PT" value="-0800"/>
+ <timeZone name="PWT" value="+0900"/>
+ <timeZone name="PYST" value="-0300"/>
+ <timeZone name="PYT" value="-0400"/>
+ <timeZone name="Q" value="-0400"/>
+ <timeZone name="R" value="-0500"/>
+ <timeZone name="RET" value="+0400"/>
+ <timeZone name="S" value="-0600"/>
+ <timeZone name="SAMT" value="+0400"/>
+ <timeZone name="SAST" value="+0200"/>
+ <timeZone name="SBT" value="+1100"/>
+ <timeZone name="SCT" value="+0400"/>
+ <timeZone name="SGT" value="+0800"/>
+ <timeZone name="SRT" value="-0300"/>
+ <timeZone name="SST" value="-1100"/>
+ <timeZone name="T" value="-0700"/>
+ <timeZone name="TAHT" value="-1000"/>
+ <timeZone name="TFT" value="+0500"/>
+ <timeZone name="TJT" value="+0500"/>
+ <timeZone name="TKT" value="-1000"/>
+ <timeZone name="TLT" value="+0900"/>
+ <timeZone name="TMT" value="+0500"/>
+ <timeZone name="TVT" value="+1200"/>
+ <timeZone name="U" value="-0800"/>
+ <timeZone name="ULAT" value="+0800"/>
+ <timeZone name="UTC" value="+0000"/>
+ <timeZone name="UYST" value="-0200"/>
+ <timeZone name="UYT" value="-0300"/>
+ <timeZone name="UZT" value="+0500"/>
+ <timeZone name="V" value="-0900"/>
+ <timeZone name="VET" value="-0430"/>
+ <timeZone name="VLAST" value="+1100"/>
+ <timeZone name="VLAT" value="+1000"/>
+ <timeZone name="VUT" value="+1100"/>
+ <timeZone name="W" value="-1000"/>
+ <timeZone name="WAST" value="+0200"/>
+ <timeZone name="WAT" value="+0100"/>
+ <timeZone name="WDT" value="+0900"/>
+ <timeZone name="WEST" value="+0100"/>
+ <timeZone name="WET" value="+0000"/>
+ <timeZone name="WFT" value="+1200"/>
+ <timeZone name="WGST" value="-0200"/>
+ <timeZone name="WGT" value="-0300"/>
+ <timeZone name="WIB" value="+0700"/>
+ <timeZone name="WIT" value="+0900"/>
+ <timeZone name="WITA" value="+0800"/>
+ <!--<timeZone name="WST" value="+0100"/>-->
+ <!--<timeZone name="WST" value="-1100"/>-->
+ <timeZone name="WST" value="+0800"/>
+ <timeZone name="WT" value="+0000"/>
+ <timeZone name="X" value="-1100"/>
+ <timeZone name="Y" value="-1200"/>
+ <timeZone name="YAKST" value="+1000"/>
+ <timeZone name="YAKT" value="+0900"/>
+ <timeZone name="YAPT" value="+1000"/>
+ <timeZone name="YEKST" value="+0600"/>
+ <timeZone name="YEKY" value="+0500"/>
+ <timeZone name="Z" value="+0000"/>
+ </dictionary>
+return $result
+};
+
+(:~
+ : Internal auxiliary function that returns an XML representation for a dictionary that contains a
+ : numeric value associated to different month name abbreviations.
+ :
+ : Each &lt;month&gt; element carries the full English month name and its two-digit
+ : numeric value ("01" through "12"); the nested &lt;abrv&gt; elements enumerate the
+ : spelling variants (capitalised, lower-case, upper-case) that should be
+ : recognised as that month when normalising dates.
+ :
+ : NOTE(review): "May" lists only the "MAY" and "may" variants, presumably because
+ : the mixed-case form equals the @name attribute itself — confirm the lookup code
+ : also matches against @name.
+ :
+ : @return An element node rooted at &lt;dictionary&gt; with one &lt;month&gt; child per month.
+ :)
+declare %private function normalization:month-dictionary() as node(){
+let $dictionary :=
+<dictionary>
+ <month name="January" value="01">
+ <abrv>Jan</abrv>
+ <abrv>jan</abrv>
+ <abrv>JAN</abrv>
+ </month>
+ <month name="February" value="02">
+ <abrv>Feb</abrv>
+ <abrv>feb</abrv>
+ <abrv>FEB</abrv>
+ </month>
+ <month name="March" value="03">
+ <abrv>Mar</abrv>
+ <abrv>mar</abrv>
+ <abrv>MAR</abrv>
+ </month>
+ <month name="April" value="04">
+ <abrv>Apr</abrv>
+ <abrv>apr</abrv>
+ <abrv>APR</abrv>
+ </month>
+ <month name="May" value="05">
+ <abrv>MAY</abrv>
+ <abrv>may</abrv>
+ </month>
+ <month name="June" value="06">
+ <abrv>Jun</abrv>
+ <abrv>jun</abrv>
+ <abrv>JUN</abrv>
+ </month>
+ <month name="July" value="07">
+ <abrv>Jul</abrv>
+ <abrv>jul</abrv>
+ <abrv>JUL</abrv>
+ </month>
+ <month name="August" value="08">
+ <abrv>aug</abrv>
+ <abrv>Aug</abrv>
+ <abrv>AUG</abrv>
+ </month>
+ <month name="September" value="09">
+ <abrv>sep</abrv>
+ <abrv>Sep</abrv>
+ <abrv>SEP</abrv>
+ </month>
+ <month name="October" value="10">
+ <abrv>oct</abrv>
+ <abrv>OCT</abrv>
+ <abrv>Oct</abrv>
+ </month>
+ <month name="November" value="11">
+ <abrv>nov</abrv>
+ <abrv>Nov</abrv>
+ <abrv>NOV</abrv>
+ </month>
+ <month name="December" value="12">
+ <abrv>dec</abrv>
+ <abrv>Dec</abrv>
+ <abrv>DEC</abrv>
+ </month>
+</dictionary>
+return $dictionary
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,117 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides phonetic string similarity functions, comparing strings with basis on how they sound.
+ :
+ : These metrics are particularly effective in matching names, since names are often spelled in different
+ : ways that sound the same.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the Soundex key for a given string.
+ :
+ : The implementation keeps the first letter, maps the remaining letters to the
+ : Soundex digit classes 1-6, drops every character outside those classes
+ : (vowels, H, W, Y, punctuation), collapses runs of identical adjacent digits,
+ : and finally pads with "0" (or truncates) to exactly four characters.
+ :
+ : NOTE(review): vowels are removed *before* adjacent digits are merged, so two
+ : equal codes separated by a vowel collapse into one; strict Soundex keeps them
+ : distinct — confirm against the module's test suite whether this is intended.
+ :
+ : <br/>
+ : Example usage : <pre> soundex-key("Robert") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> "R163" </pre>
+ :
+ : @param $s1 The string.
+ : @return The Soundex key for the given input string.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq
+ :)
+declare function simp:soundex-key ( $s1 as xs:string ) as xs:string {
+ (: Code the tail of the string: B, F, P, V map to digit 1. :)
+ let $group1 := replace(upper-case(substring($s1,2)),"[BFPV]","1")
+ (: Remaining digit classes 2-6; anything left uncoded is dropped. :)
+ let $groups := replace(replace(replace(replace(replace(replace($group1,"[CGJKQSXZ]","2"),"[DT]","3"),"L","4"),"[MN]","5"),"R","6"),"[^1-6]","")
+ (: Collapse pairs of identical adjacent digits; replace() handles each pair
+    once per pass, so longer runs are finished by the recursion below. :)
+ let $merge := replace($groups,"([1-6])\1","$1")
+ let $result := concat(upper-case(substring($s1,1,1)), $merge)
+ (: Recurse while duplicate adjacent digits remain, then pad/truncate to 4. :)
+ return if (string-length($result) > 4 and matches($result,"([1-6])\1"))
+ then (simp:soundex-key($result))
+ else (substring(concat($result,"0000"),1,4))
+};
+
+(:~
+ : Tests whether two strings are phonetically equivalent under Soundex,
+ : i.e. whether their four-character Soundex keys are identical.
+ :
+ : <br/>
+ : Example usage : <pre> soundex( "Robert" , "Rupert" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> true </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return Returns true if both strings have the same Soundex key and false otherwise.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq
+ :)
+declare function simp:soundex ( $s1 as xs:string, $s2 as xs:string ) as xs:boolean {
+ let $key1 := simp:soundex-key($s1)
+ let $key2 := simp:soundex-key($s2)
+ return $key1 = $key2
+};
+
+(:~
+ : Returns the Metaphone key for a given string.
+ : The Metaphone algorithm produces variable length keys as its output, as opposed to Soundex's fixed-length keys.
+ :
+ : The key is built by a fixed cascade of regular-expression rewrites over the
+ : upper-cased input (duplicate-letter folding, initial-cluster reduction, the
+ : per-letter Metaphone substitution rules), followed by the removal of every
+ : vowel that is not in the leading position.
+ :
+ : <br/>
+ : Example usage : <pre> metaphone-key("ALEKSANDER") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> "ALKSNTR" </pre>
+ :
+ : @param $s1 The string.
+ : @return The Metaphone key for the given input string.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq
+ :)
+declare function simp:metaphone-key ( $s1 as xs:string ) as xs:string {
+ (: Fold doubled letters, except C which Metaphone keeps doubled. :)
+ let $aux1 := replace(upper-case($s1),"([^C])\1","$1")
+ (: Drop the first letter of the initial clusters KN/GN/PN, AE, WR.
+    Fixed: the start-of-string anchor is "^"; the previous "$(...)" pattern
+    could never match (in XPath regexes "$" matches only at the end). :)
+ let $aux2 := if (matches($aux1,"^(([KGP]N)|([A]E)|([W]R))")) then (substring($aux1,2,string-length($aux1))) else ($aux1)
+ let $aux3 := replace(replace($aux2,"MB","M"),"B$","")
+ let $aux4 := replace(replace(replace(replace(replace($aux3,"CIA","XIA"),"SCH","SKH"),"CH","XH"),"C([IEY])","S$1"),"C","K")
+ let $aux5 := replace(replace($aux4,"DG([EYI])","JG$1"),"D","T")
+ (: Fixed: the end-of-string anchor for the silent "GN"/"GNED" rule is "$";
+    the previous pattern "G(N(ED)?)^" could never match. :)
+ let $aux6 := replace(replace($aux5,"GH([^AEIOU])","H$1"),"G(N(ED)?)$","$1")
+ let $aux7 := replace(replace(replace($aux6,"([^G]?)G([IEY])","$1J$2"),"([^G]?)G","$1K"),"GG","G")
+ let $aux8 := replace(replace(replace(replace($aux7,"([AEIOU])H([^AEIOU])","$1$2"),"CK","K"),"PH","F"),"Q","K")
+ let $aux9 := replace(replace(replace(replace(replace($aux8,"S(H|(IO)|(IA))","X$1"),"T((IO)|(IA))","X$1"),"TH","0"),"TCH","CH"),"V","F")
+ (: Fixed: the initial-"WH" and initial-"X" rules use the "^" start anchor
+    (they were written with "$" and therefore never fired). :)
+ let $aux10 := replace(replace(replace(replace(replace(replace($aux9,"^WH","W"),"W([^AEIOU])","$1"),"^X","S"),"X","KS"),"Y([^AEIOU])","$1"),"Z","S")
+ (: Keep the first character as-is; strip vowels from the remainder. :)
+ return concat(substring($aux10,1,1) , replace(substring($aux10,2,string-length($aux10)) , "[AEIOU]", ""))
+};
+
+(:~
+ : Tests whether two strings are phonetically equivalent under Metaphone,
+ : i.e. whether their (variable-length) Metaphone keys are identical.
+ :
+ : <br/>
+ : Example usage : <pre> metaphone("ALEKSANDER", "ALEXANDRE") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> true </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return Returns true if both strings have the same Metaphone key and false otherwise.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq
+ :)
+declare function simp:metaphone ( $s1 as xs:string, $s2 as xs:string ) as xs:boolean {
+ let $key1 := simp:metaphone-key($s1)
+ let $key2 := simp:metaphone-key($s2)
+ return $key1 = $key2
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,150 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides similarity functions for comparing sets of XML
+ : nodes (e.g., sets of XML elements, attributes or atomic values).
+ :
+ : These functions are particularly useful for matching near duplicate sets of XML nodes.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Computes the union of two sequences, comparing items with deep-equal()
+ : so that structurally identical XML nodes count as the same member.
+ : Each item is kept only on its first occurrence, preserving order.
+ :
+ : <br/>
+ : Example usage : <pre> deep-union ( ( "a", "b", "c") , ( "a", "a", &lt;d/&gt; ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a", "b", "c", &lt;d/&gt; ) </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The union of both sets.
+ : @example test/Queries/data-cleaning/set-similarity/deep-union.xq
+ :)
+declare function set:deep-union ( $s1 , $s2 ) {
+ let $all := ( $s1 , $s2 )
+ for $item at $pos in $all
+ (: emit $item only if no earlier item is deep-equal to it :)
+ where empty( subsequence($all, 1, $pos - 1)[deep-equal(., $item)] )
+ return $item
+};
+
+(:~
+ : Computes the intersection of two sequences, comparing items with deep-equal()
+ : so that structurally identical XML nodes count as the same member.
+ : Duplicates within the first sequence are emitted only once, preserving order.
+ :
+ : <br/>
+ : Example usage : <pre> deep-intersect ( ( "a", "b", "c") , ( "a", "a", &lt;d/&gt; ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The intersection of both sets.
+ : @example test/Queries/data-cleaning/set-similarity/deep-intersect.xq
+ :)
+declare function set:deep-intersect ( $s1 , $s2 ) {
+ for $item at $pos in $s1
+ (: first occurrence within $s1, and present somewhere in $s2 :)
+ where empty( subsequence($s1, 1, $pos - 1)[deep-equal(., $item)] )
+       and exists( $s2[deep-equal(., $item)] )
+ return $item
+};
+
+(:~
+ : Removes duplicates from a sequence, comparing items with deep-equal()
+ : so that structurally identical XML nodes count as duplicates.
+ : The first occurrence of every item is kept, preserving order.
+ :
+ : <br/>
+ : Example usage : <pre> distinct ( ( "a", "a", &lt;b/&gt; ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a", &lt;b/&gt; ) </pre>
+ :
+ : @param $s A set.
+ : @return The set provided as input without the exact duplicates (i.e., returns the distinct nodes from the set provided as input).
+ : @example test/Queries/data-cleaning/set-similarity/distinct.xq
+ :)
+declare function set:distinct ( $s ) {
+ for $item at $pos in $s
+ (: keep $item only if nothing before it is deep-equal to it :)
+ where empty( subsequence($s, 1, $pos - 1)[deep-equal(., $item)] )
+ return $item
+};
+
+(:~
+ : Computes the overlap coefficient between two sets of XML nodes:
+ : the size of their deep-equal intersection divided by the size of the
+ : smaller of the two (deduplicated) input sets.
+ :
+ : <br/>
+ : Example usage : <pre> overlap ( ( "a", "b", &lt;c/&gt; ) , ( "a", "a", "b" ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The overlap coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/overlap.xq
+ :)
+declare function set:overlap ( $s1 , $s2 ) as xs:double {
+ let $shared := count( set:deep-intersect($s1, $s2) )
+ let $smaller := min( ( count(set:distinct($s1)) , count(set:distinct($s2)) ) )
+ return $shared div $smaller
+};
+
+(:~
+ : Computes the Dice similarity coefficient between two sets of XML nodes:
+ : twice the size of their deep-equal intersection divided by the sum of the
+ : sizes of the two (deduplicated) input sets.
+ :
+ : <br/>
+ : Example usage : <pre> dice ( ( "a", "b", &lt;c/&gt; ) , ( "a", "a", "d") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The Dice similarity coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/dice.xq
+ :)
+declare function set:dice ( $s1 , $s2 ) as xs:double {
+ let $shared := count( set:deep-intersect($s1,$s2) )
+ let $total := count(set:distinct($s1)) + count(set:distinct($s2))
+ return ( 2 * $shared ) div $total
+};
+
+(:~
+ : Computes the Jaccard similarity coefficient between two sets of XML nodes:
+ : the size of their deep-equal intersection divided by the size of their
+ : deep-equal union.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard ( ( "a", "b", &lt;c/&gt; ) , ( "a", "a", "d") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.25 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The Jaccard similarity coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/jaccard.xq
+ :)
+declare function set:jaccard ( $s1 , $s2 ) as xs:double {
+ let $shared := count( set:deep-intersect($s1,$s2) )
+ let $combined := count( set:deep-union($s1,$s2) )
+ return $shared div $combined
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,249 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides token-based string similarity functions that view strings
+ : as sets or multi-sets of tokens and use set-related properties to compute similarity scores.
+ : The tokens correspond to groups of characters extracted from the strings being compared, such as
+ : individual words or character n-grams.
+ :
+ : These functions are particularly useful for matching near duplicate strings in cases where
+ : typographical conventions often lead to rearrangement of words (e.g., "John Smith" versus "Smith, John").
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the module requires the trigonometic functions of XQuery 1.1 or a math extension
+ : function such as sqrt($x as numeric) for computing the square root.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+(: In the QizX or Saxon XQuery engines, it is possible to call external functions from the Java math library :)
+(: declare namespace math = "java:java.lang.Math"; :)
+declare namespace math = "http://www.w3.org/2005/xpath-functions/math";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the individual character n-grams forming a string, padding the
+ : string boundaries with "_" so that leading and trailing characters appear
+ : in as many n-grams as interior ones. Literal "_" characters in the input
+ : are escaped as "\_" to keep them distinguishable from padding.
+ :
+ : NOTE(review): for $n &gt;= 2 the second and third generators below overlap,
+ : so some interior n-grams near the end of the string are emitted twice —
+ : visible in the example output, where "LW" and "WO" each appear twice.
+ : Set-based metrics deduplicate, but frequency-based metrics (cosine) count
+ : the duplicates; confirm this is intended before changing it, since the
+ : documented cosine-ngrams example value depends on it.
+ :
+ : <br/>
+ : Example usage : <pre> ngrams("FLWOR", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("_F" , "FL" , "LW" , "WO" , "LW" , "WO" , "OR" , "R_") </pre>
+ :
+ : @param $s The input string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The sequence of strings with the extracted n-grams.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq
+ :)
+declare function simt:ngrams ( $s as xs:string, $n as xs:integer ) as xs:string* {
+ let $pad := '_'
+ return
+ (: 1) n-grams overlapping the start of the string, left-padded with "_" :)
+ ( for $a in 1 to $n
+ let $apad := string-join( for $aux in $a + 1 to $n return $pad , '' )
+ return concat( $apad , replace(substring($s,1,$a) , "_", "\\_") ) ,
+
+ (: 2) interior n-grams :)
+ for $b in $n + 2 to string-length($s) return replace(substring($s,$b - $n, $n), "_", "\\_") ,
+
+ (: 3) n-grams overlapping the end of the string, right-padded with "_" :)
+ for $c in string-length($s) - (if ($n = 1) then (-1) else ($n)) - 1 to string-length($s)
+ let $cpad := string-join( for $aux in string-length($s) - $c + 2 to $n return $pad , '' )
+ return concat(replace(substring($s, $c, $n), "_", "\\_"), $cpad )
+ )
+};
+
+(:~
+ : Auxiliary function for computing the cosine similarity coefficient between
+ : two string descriptors (sequences of character n-grams or tokens).
+ : Each descriptor is turned into a term-frequency vector over the combined
+ : vocabulary; the result is the dot product of the two vectors divided by
+ : the product of their Euclidean norms.
+ :
+ : <br/>
+ : Example usage : <pre> cosine( ("aa","bb") , ("bb","aa")) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $desc1 The descriptor for the first string.
+ : @param $desc2 The descriptor for the second string.
+ : @return The cosine similarity coefficient between the descriptors for the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine.xq
+ :)
+declare function simt:cosine ( $desc1 as xs:string*, $desc2 as xs:string* ) as xs:double {
+ let $vocab := distinct-values( ($desc1, $desc2) )
+ (: term-frequency vectors over the shared vocabulary :)
+ let $tf1 := for $term in $vocab return count($desc1[. = $term])
+ let $tf2 := for $term in $vocab return count($desc2[. = $term])
+ let $dot := sum( for $f at $i in $tf1 return $f * $tf2[$i] )
+ let $norm1 := math:sqrt( sum( for $f in $tf1 return $f * $f ) )
+ let $norm2 := math:sqrt( sum( for $f in $tf2 return $f * $f ) )
+ return $dot div ( $norm1 * $norm2 )
+};
+
+(:~
+ : Computes the Dice similarity coefficient between the sets of character
+ : n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> dice-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4615384615384616 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The Dice similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq
+ :)
+declare function simt:dice-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+ let $grams1 := simt:ngrams($s1, $n)
+ let $grams2 := simt:ngrams($s2, $n)
+ return set:dice($grams1, $grams2)
+};
+
+(:~
+ : Computes the overlap similarity coefficient between the sets of character
+ : n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> overlap-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The overlap similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq
+ :)
+declare function simt:overlap-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+ let $grams1 := simt:ngrams($s1, $n)
+ let $grams2 := simt:ngrams($s2, $n)
+ return set:overlap($grams1, $grams2)
+};
+
+(:~
+ : Computes the Jaccard similarity coefficient between the sets of character
+ : n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.3 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The Jaccard similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq
+ :)
+declare function simt:jaccard-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+ let $grams1 := simt:ngrams($s1, $n)
+ let $grams2 := simt:ngrams($s2, $n)
+ return set:jaccard($grams1, $grams2)
+};
+
+(:~
+ : Computes the cosine similarity coefficient between the character n-grams
+ : extracted from two strings. Unlike the set-based metrics, the n-grams are
+ : weighted by their occurrence frequency (the term-frequency heuristic from
+ : Information Retrieval).
+ :
+ : <br/>
+ : Example usage : <pre> cosine-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.2401922307076307 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The cosine similarity coefficient between the sets n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq
+ :)
+declare function simt:cosine-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+ simt:cosine( simt:ngrams($s1,$n) , simt:ngrams($s2,$n) )
+};
+
+(:~
+ : Computes the Dice similarity coefficient between the sets of tokens
+ : extracted from two strings, splitting on a delimiter regular expression.
+ :
+ : <br/>
+ : Example usage : <pre> dice-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The Dice similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq
+ :)
+declare function simt:dice-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ let $tokens1 := tokenize($s1, $r)
+ let $tokens2 := tokenize($s2, $r)
+ return set:dice($tokens1, $tokens2)
+};
+
+(:~
+ : Computes the overlap similarity coefficient between the sets of tokens
+ : extracted from two strings, splitting on a delimiter regular expression.
+ :
+ : <br/>
+ : Example usage : <pre> overlap-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The overlap similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq
+ :)
+declare function simt:overlap-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ let $tokens1 := tokenize($s1, $r)
+ let $tokens2 := tokenize($s2, $r)
+ return set:overlap($tokens1, $tokens2)
+};
+
+(:~
+ : Computes the Jaccard similarity coefficient between the sets of tokens
+ : extracted from two strings, splitting on a delimiter regular expression.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.25 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The Jaccard similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq
+ :)
+declare function simt:jaccard-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ let $tokens1 := tokenize($s1, $r)
+ let $tokens2 := tokenize($s2, $r)
+ return set:jaccard($tokens1, $tokens2)
+};
+
+(:~
+ : Computes the cosine similarity coefficient between the tokens extracted from
+ : two strings, splitting on a delimiter regular expression. The tokens are
+ : weighted by their occurrence frequency (the term-frequency heuristic from
+ : Information Retrieval).
+ :
+ : <br/>
+ : Example usage : <pre> cosine-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.408248290463863 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq
+ :)
+declare function simt:cosine-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ simt:cosine( tokenize($s1,$r) , tokenize($s2,$r) )
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd'
--- src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd 2011-10-19 02:07:27 +0000
@@ -0,0 +1,343 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="qualified"
+ targetNamespace="http://api.whitepages.com/schema/" xmlns:wp="http://api.whitepages.com/schema/">
+<!--
+:: Copyright 2006-2008 The FLWOR Foundation.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+::
+-->
+
+ <xs:element name="wp">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:result"/>
+ <xs:element minOccurs="0" ref="wp:errormessages"/>
+ <xs:element minOccurs="0" ref="wp:meta"/>
+ <xs:element minOccurs="0" ref="wp:listings"/>
+ <xs:element minOccurs="0" ref="wp:options"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="result">
+ <xs:complexType>
+ <xs:attribute name="code" use="required" type="wp:responsecode"/>
+ <xs:attribute name="message"/>
+ <xs:attribute name="type" use="required" type="wp:responsetype"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:simpleType name="responsetype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="success"/>
+ <xs:enumeration value="error"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <xs:simpleType name="responsecode">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="Found Data"/>
+ <xs:enumeration value="No Data Found"/>
+ <xs:enumeration value="Truncated Data"/>
+ <xs:enumeration value="Error"/>
+ <xs:enumeration value="Server Error"/>
+ <xs:enumeration value="Invalid Input"/>
+ <xs:enumeration value="Mismatched Input"/>
+ <xs:enumeration value="Missing Input"/>
+ <xs:enumeration value="Refine Input"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <xs:element name="errormessages">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:message"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="message" type="xs:string"/>
+ <xs:element name="meta">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:linkexpiration"/>
+ <xs:element ref="wp:recordrange"/>
+ <xs:element ref="wp:apiversion"/>
+ <xs:element ref="wp:searchid"/>
+ <xs:element ref="wp:searchlinks"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="linkexpiration" type="xs:date"/>
+ <xs:element name="recordrange">
+ <xs:complexType>
+ <xs:attribute name="lastrecord" use="required" type="xs:integer"/>
+ <xs:attribute name="firstrecord" use="required" type="xs:integer"/>
+ <xs:attribute name="totalavailable" use="required" type="xs:integer"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="apiversion" type="xs:string"/>
+ <xs:element name="searchid" type="xs:string"/>
+ <xs:element name="searchlinks">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:link"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="listings">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:listing"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="listing">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:people"/>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:business"/>
+ <xs:element minOccurs="0" ref="wp:displayname"/>
+ <xs:element minOccurs="0" ref="wp:tagline"/>
+ <xs:element minOccurs="0" ref="wp:phonenumbers"/>
+ <xs:element minOccurs="0" ref="wp:address"/>
+ <xs:element minOccurs="0" ref="wp:geodata"/>
+ <xs:element minOccurs="0" ref="wp:listingmeta"/>
+ </xs:sequence>
+ <xs:attribute name="sponsored" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="people">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:person"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="person" type="wp:personType"/>
+ <xs:complexType name="personType">
+ <xs:sequence>
+ <xs:element ref="wp:firstname"/>
+ <xs:element minOccurs="0" ref="wp:middlename"/>
+ <xs:element ref="wp:lastname"/>
+ <xs:element minOccurs="0" ref="wp:suffix"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ </xs:complexType>
+
+ <xs:simpleType name="rank">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="primary"/>
+ <xs:enumeration value="secondary"/>
+ <xs:enumeration value="tertiary"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="firstname" type="xs:string"/>
+ <xs:element name="middlename" type="xs:string"/>
+ <xs:element name="lastname" type="xs:string"/>
+ <xs:element name="suffix" type="xs:string"/>
+ <xs:element name="business">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:businessname"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="businessname" type="xs:string"/>
+ <xs:element name="displayname" type="xs:string"/>
+ <xs:element name="tagline" type="xs:string"/>
+ <xs:element name="phonenumbers">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:phone"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="phone">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:fullphone"/>
+ <xs:element minOccurs="0" ref="wp:areacode"/>
+ <xs:element ref="wp:exchange"/>
+ <xs:element ref="wp:linenumber"/>
+ <xs:element minOccurs="0" ref="wp:carrier"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ <xs:attribute name="type" use="required" type="wp:listingtype"/>
+ <xs:attribute name="carrier_only" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:simpleType name="listingtype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="work"/>
+ <xs:enumeration value="home"/>
+ <xs:enumeration value="business"/>
+ <xs:enumeration value="government"/>
+ <xs:enumeration value="mobile"/>
+ <xs:enumeration value="landline"/>
+ <xs:enumeration value="pager"/>
+ <xs:enumeration value="satellite"/>
+ <xs:enumeration value="unknown"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="fullphone" type="xs:string"/>
+ <xs:element name="areacode" type="xs:string"/>
+ <xs:element name="exchange" type="xs:string"/>
+ <xs:element name="linenumber" type="xs:string"/>
+ <xs:element name="carrier" type="xs:string"/>
+ <xs:element name="address">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:fullstreet"/>
+ <xs:element minOccurs="0" ref="wp:house"/>
+ <xs:element minOccurs="0" ref="wp:predir"/>
+ <xs:element minOccurs="0" ref="wp:street"/>
+ <xs:element minOccurs="0" ref="wp:postdir"/>
+ <xs:element minOccurs="0" ref="wp:streettype"/>
+ <xs:element minOccurs="0" ref="wp:aptnumber"/>
+ <xs:element minOccurs="0" ref="wp:apttype"/>
+ <xs:element minOccurs="0" ref="wp:city"/>
+ <xs:element minOccurs="0" ref="wp:state"/>
+ <xs:element minOccurs="0" ref="wp:zip"/>
+ <xs:element minOccurs="0" ref="wp:zip4"/>
+ <xs:element minOccurs="0" ref="wp:country"/>
+ </xs:sequence>
+ <xs:attribute name="deliverable" use="required" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:simpleType name="country">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="US"/>
+ <xs:enumeration value="CA"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="fullstreet" type="xs:string"/>
+ <xs:element name="house" type="xs:string"/>
+ <xs:element name="predir" type="xs:string"/>
+ <xs:element name="street" type="xs:string"/>
+ <xs:element name="postdir" type="xs:string"/>
+ <xs:element name="streettype" type="xs:string"/>
+ <xs:element name="aptnumber" type="xs:string"/>
+ <xs:element name="apttype" type="xs:string"/>
+ <xs:element name="city" type="xs:string"/>
+ <xs:element name="state" type="xs:string"/>
+ <xs:element name="zip" type="xs:string"/>
+ <xs:element name="zip4" type="xs:string"/>
+ <xs:element name="country" type="wp:country"/>
+ <xs:element name="geodata">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:geoprecision"/>
+ <xs:element ref="wp:latitude"/>
+ <xs:element ref="wp:longitude"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="geoprecision" type="xs:integer"/>
+ <xs:element name="latitude" type="xs:string"/>
+ <xs:element name="longitude" type="xs:string"/>
+ <xs:element name="previous_locations">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="previous_location" maxOccurs="unbounded" type="wp:locationType"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:complexType name="locationType">
+ <xs:sequence>
+ <xs:element name="city" type="xs:string"/>
+ <xs:element name="state" type="xs:string"/>
+ <xs:element name="year" type="xs:string"/>
+ </xs:sequence>
+ </xs:complexType>
+ <xs:element name="listingmeta">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:lastvalidated"/>
+ <xs:element minOccurs="0" ref="wp:type"/>
+ <xs:element minOccurs="0" ref="wp:sponsor"/>
+ <xs:element minOccurs="0" ref="wp:recordid"/>
+ <xs:element ref="wp:moreinfolinks"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="lastvalidated" type="xs:string"/>
+ <xs:element name="sponsor" type="xs:string"/>
+ <xs:element name="recordid" type="xs:string"/>
+ <xs:element name="type" type="wp:listingtype"/>
+ <xs:element name="moreinfolinks">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:link"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="options">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:cityoption" minOccurs="0" />
+ <xs:element maxOccurs="unbounded" ref="wp:categoryoption" minOccurs="0" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="cityoption">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:refinesearchurl"/>
+ </xs:sequence>
+ <xs:attribute name="city" use="required" type="xs:string"/>
+ <xs:attribute name="country" use="required" type="wp:country"/>
+ <xs:attribute name="state" use="required" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="categoryoption">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:refinesearchurl"/>
+ </xs:sequence>
+ <xs:attribute name="total" use="required" type="xs:string"/>
+ <xs:attribute name="description" use="required" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="refinesearchurl" type="xs:anyURI"/>
+ <xs:element name="link">
+ <xs:complexType>
+ <xs:simpleContent>
+ <xs:extension base="xs:anyURI">
+ <xs:attribute name="linktext" use="required" type="xs:string"/>
+ <xs:attribute name="type" use="required" type="wp:linktype"/>
+ </xs:extension>
+ </xs:simpleContent>
+ </xs:complexType>
+ </xs:element>
+ <xs:simpleType name="linktype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="drivingdirections"/>
+ <xs:enumeration value="findneighbors"/>
+ <xs:enumeration value="homepage"/>
+ <xs:enumeration value="viewdetails"/>
+ <xs:enumeration value="viewmap"/>
+
+ <xs:enumeration value="mapareacode"/>
+
+ <xs:enumeration value="allresults"/>
+ <xs:enumeration value="mapallresults"/>
+ <xs:enumeration value="self"/>
+ <xs:enumeration value="worklistings"/>
+
+ <xs:enumeration value="viewsearchsuggestions"/>
+ </xs:restriction>
+ </xs:simpleType>
+</xs:schema>
\ No newline at end of file
=== added directory 'test'
=== renamed directory 'test' => 'test.moved'
=== added directory 'test/ExpQueryResults'
=== added directory 'test/ExpQueryResults/data-cleaning'
=== added directory 'test/ExpQueryResults/data-cleaning/character-based-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+2
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.8577777777777778
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.5853174603174604
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0
=== added directory 'test/ExpQueryResults/data-cleaning/consolidation'
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<c/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<c/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+eeefff
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+b
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+aaa
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a A b c AAA d
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<a att1="a1" att2="a2"/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<a att1="a1" att2="a2" att3="a3"/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<a><b/><c/><d/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+aaabbb
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a b c
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+aaa bbb
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/conversion'
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Portugal Lisbon praça Marquês de Pombal
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+4610 Harrison Bend Rd, Loudon, TN, US
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+3362 Walden Ave, Depew, NY, US 222 E 53rd St, Los Angeles, CA, US
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.747887218607434
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+38.725735 -9.15021
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+(520) 824-3160
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/phone-from-user.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/phone-from-user.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/phone-from-user.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+(716) 686-4500
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Stan Smith
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Homer V Simpson Homer Simpson Sue M Simpson
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.907838383838384
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/normalization'
=== added file 'test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Portugal Lisbon Marquês de Pombal
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res'
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res'
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res'
=== added directory 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+ALKSNTR
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+ALKSNTR
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+R163
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+true
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/set-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a b c<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.4
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+a<b/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.25
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/token-based-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.2401922307076307
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.408248290463863
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.4615384615384616
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.4
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.3
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.25
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+_F FL LW WO LW WO OR R_
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added directory 'test/Queries'
=== added directory 'test/Queries/data-cleaning'
=== added directory 'test/Queries/data-cleaning/character-based-string-similarity'
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:edit-distance("FLWOR", "FLOWER")
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:jaro-winkler("DWAYNE", "DUANE", 4, 0.1 )
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/jaro.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/jaro.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/jaro.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:jaro("FLWOR Found.", "FLWOR Foundation")
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:needleman-wunsch("KAK", "KQRK", 1, 1)
=== added directory 'test/Queries/data-cleaning/consolidation'
=== added file 'test/Queries/data-cleaning/consolidation/least-attributes.xq'
--- test/Queries/data-cleaning/consolidation/least-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-attributes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-elements.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-elements.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-elements( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-elements.xq'
--- test/Queries/data-cleaning/consolidation/least-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-elements.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-elements( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-nodes.xq'
--- test/Queries/data-cleaning/consolidation/least-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-nodes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq'
--- test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" )
=== added file 'test/Queries/data-cleaning/consolidation/least-tokens.xq'
--- test/Queries/data-cleaning/consolidation/least-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-tokens.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-tokens( ( "a b c", "a b", "a"), " +" )
=== added file 'test/Queries/data-cleaning/consolidation/leastfrequent_1.xq'
--- test/Queries/data-cleaning/consolidation/leastfrequent_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/leastfrequent_1.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-frequent( ( "a", "a", "b") )
=== added file 'test/Queries/data-cleaning/consolidation/longest_1.xq'
--- test/Queries/data-cleaning/consolidation/longest_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/longest_1.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:longest( ( "a", "aa", "aaa") )
=== added file 'test/Queries/data-cleaning/consolidation/matching_1.xq'
--- test/Queries/data-cleaning/consolidation/matching_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/matching_1.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:matching( ( "a A b", "c AAA d", "e BB f"), "A+" )
=== added file 'test/Queries/data-cleaning/consolidation/most-attributes.xq'
--- test/Queries/data-cleaning/consolidation/most-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-attributes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-attributes( ( <a att1="a1" att2="a2" att3="a3"/>, <a att1="a1" att2="a2"><b att2="a2" /></a>, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-elements.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-elements.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-elements( ( <a><b/><c/><d/></a>, <a><b/><b/><c/></a>, <a/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-nodes( ( <a><b/></a>, <a><a/></a>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-elements.xq'
--- test/Queries/data-cleaning/consolidation/most-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-elements.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-elements( ( <a><b/></a>, <a/>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-frequent.xq'
--- test/Queries/data-cleaning/consolidation/most-frequent.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-frequent.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-frequent( ( "a", "a", "b") )
=== added file 'test/Queries/data-cleaning/consolidation/most-nodes.xq'
--- test/Queries/data-cleaning/consolidation/most-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-nodes.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-nodes( ( <a><b/></a>, <a/>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq'
--- test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" )
=== added file 'test/Queries/data-cleaning/consolidation/most-tokens.xq'
--- test/Queries/data-cleaning/consolidation/most-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-tokens.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-tokens( ( "a b c", "a b", "a"), " +" )
=== added file 'test/Queries/data-cleaning/consolidation/shortest_1.xq'
--- test/Queries/data-cleaning/consolidation/shortest_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/shortest_1.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:shortest( ( "a", "aa", "aaa") )
=== added file 'test/Queries/data-cleaning/consolidation/superstring_1.xq'
--- test/Queries/data-cleaning/consolidation/superstring_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/superstring_1.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:superstring( ( "aaa bbb ccc", "aaa bbb", "aaa ddd", "eee fff" ) )
=== added directory 'test/Queries/data-cleaning/conversion'
=== added file 'test/Queries/data-cleaning/conversion/address-from-geocode.xq'
--- test/Queries/data-cleaning/conversion/address-from-geocode.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-geocode.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-geocode ( 38.725735 , -9.15021 )
=== added file 'test/Queries/data-cleaning/conversion/address-from-phone.xq'
--- test/Queries/data-cleaning/conversion/address-from-phone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-phone.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-phone ('8654582358')
=== added file 'test/Queries/data-cleaning/conversion/address-from-user.xq'
--- test/Queries/data-cleaning/conversion/address-from-user.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-user.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-user ('Maria Lurdes')
=== added file 'test/Queries/data-cleaning/conversion/currency-convert.xq'
--- test/Queries/data-cleaning/conversion/currency-convert.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/currency-convert.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:currency-convert ( 1, "USD", "EUR", "2011-01-18" )
=== added file 'test/Queries/data-cleaning/conversion/geocode-from-address.xq'
--- test/Queries/data-cleaning/conversion/geocode-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/geocode-from-address.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:geocode-from-address ( ("Lisboa", "Portugal") )
=== added file 'test/Queries/data-cleaning/conversion/phone-from-address.xq'
--- test/Queries/data-cleaning/conversion/phone-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/phone-from-address.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:phone-from-address('5655 E Gaskill Rd, Willcox, AZ, US')
=== added file 'test/Queries/data-cleaning/conversion/phone-from-user.xq'
--- test/Queries/data-cleaning/conversion/phone-from-user.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/phone-from-user.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:phone-from-user ('Maria Lurdes')
=== added file 'test/Queries/data-cleaning/conversion/unit-convert.spec'
--- test/Queries/data-cleaning/conversion/unit-convert.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/unit-convert.spec 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Error: http://expath.org/ns/error:HC002
=== added file 'test/Queries/data-cleaning/conversion/unit-convert.xq'
--- test/Queries/data-cleaning/conversion/unit-convert.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/unit-convert.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:unit-convert ( 1 , "Distance", "mile", "kilometer" )
=== added file 'test/Queries/data-cleaning/conversion/user-from-address.xq'
--- test/Queries/data-cleaning/conversion/user-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/user-from-address.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:user-from-address('5655 E Gaskill Rd, Willcox, AZ, US')
=== added file 'test/Queries/data-cleaning/conversion/user-from-phone.xq'
--- test/Queries/data-cleaning/conversion/user-from-phone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/user-from-phone.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:user-from-phone ('8654582358')
=== added directory 'test/Queries/data-cleaning/hybrid-string-similarity'
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:monge-elkan-jaro-winkler("Comput. Sci. and Eng. Dept., University of California, San Diego", "Department of Computer Scinece, Univ. Calif., San Diego", 4, 0.1)
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-edit-distance("The FLWOR Foundation", "FLWOR Found.", " +", 0 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-jaro-winkler("The FLWOR Foundation", "FLWOR Found.", " +", 1, 4, 0.1 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-jaro("The FLWOR Foundation", "FLWOR Found.", " +", 1 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-metaphone("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +" )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-soundex("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +")
=== added directory 'test/Queries/data-cleaning/normalization'
=== added file 'test/Queries/data-cleaning/normalization/normalize-address.xq'
--- test/Queries/data-cleaning/normalization/normalize-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/normalize-address.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:normalize-address ( ( 'Marques de Pombal' , 'Lisboa' ) )
=== added file 'test/Queries/data-cleaning/normalization/to-date.xq'
--- test/Queries/data-cleaning/normalization/to-date.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-date.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-date ( "24OCT2002" , "%d%b%Y" )
=== added file 'test/Queries/data-cleaning/normalization/to-dateTime.spec'
--- test/Queries/data-cleaning/normalization/to-dateTime.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-dateTime.spec 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/data-cleaning/normalization:notsupported
=== added file 'test/Queries/data-cleaning/normalization/to-dateTime.xq'
--- test/Queries/data-cleaning/normalization/to-dateTime.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-dateTime.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-dateTime( "24OCT2002 21:22" , "%d%b%Y %H%M" )
=== added file 'test/Queries/data-cleaning/normalization/to-time.spec'
--- test/Queries/data-cleaning/normalization/to-time.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-time.spec 2011-10-19 02:07:27 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/data-cleaning/normalization:notsupported
=== added file 'test/Queries/data-cleaning/normalization/to-time.xq'
--- test/Queries/data-cleaning/normalization/to-time.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-time.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-time ( "09 hours 10 minutes" , "%H hours %M minutes" )
=== added directory 'test/Queries/data-cleaning/phonetic-string-similarity'
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:metaphone-key("ALEKSANDER")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:metaphone-key("ALEKSANDER")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:soundex-key("Robert")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:soundex( "Robert" , "Rupert" )
=== added directory 'test/Queries/data-cleaning/set-similarity'
=== added file 'test/Queries/data-cleaning/set-similarity/deep-intersect.xq'
--- test/Queries/data-cleaning/set-similarity/deep-intersect.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/deep-intersect.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:deep-intersect ( ( "a", "b", "c") , ( "a", "a", <d/> ) )
=== added file 'test/Queries/data-cleaning/set-similarity/deep-union.xq'
--- test/Queries/data-cleaning/set-similarity/deep-union.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/deep-union.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:deep-union ( ( "a", "b", "c") , ( "a", "a", <d/> ) )
=== added file 'test/Queries/data-cleaning/set-similarity/dice.xq'
--- test/Queries/data-cleaning/set-similarity/dice.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/dice.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:dice ( ( "a", "b", <c/> ) , ( "a", "a", "d") )
=== added file 'test/Queries/data-cleaning/set-similarity/distinct.xq'
--- test/Queries/data-cleaning/set-similarity/distinct.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/distinct.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:distinct (( "a", "a", <b/> ))
=== added file 'test/Queries/data-cleaning/set-similarity/jaccard.xq'
--- test/Queries/data-cleaning/set-similarity/jaccard.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/jaccard.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:jaccard ( ( "a", "b", <c/> ) , ( "a", "a", "d") )
=== added file 'test/Queries/data-cleaning/set-similarity/overlap.xq'
--- test/Queries/data-cleaning/set-similarity/overlap.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/overlap.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:overlap ( ( "a", "b", <c/> ) , ( "a", "a", "b" ) )
=== added directory 'test/Queries/data-cleaning/token-based-string-similarity'
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine( ("aa","bb") , ("bb","aa"))
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:dice-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:dice-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:jaccard-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:jaccard-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:ngrams("FLWOR", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:overlap-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq 2011-10-19 02:07:27 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:overlap-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )