zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #03009
[Merge] lp:~zorba-coders/zorba/fix_bug_871051 into lp:zorba/data-converters-module
Sorin Marian Nasoi has proposed merging lp:~zorba-coders/zorba/fix_bug_871051 into lp:zorba/data-converters-module.
Requested reviews:
Gabriel Petrovay (gabipetrovay)
Sorin Marian Nasoi (sorin.marian.nasoi)
Bruno Martins (bgmartins)
Matthias Brantner (matthias-brantner)
Diogo Simões (diogo-simoes89)
Related bugs:
Bug #871051 in Zorba: "3 data-cleaning tests failing"
https://bugs.launchpad.net/zorba/+bug/871051
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/fix_bug_871051/+merge/86700
Fix for bug lp:871051.
--
https://code.launchpad.net/~zorba-coders/zorba/fix_bug_871051/+merge/86700
Your team Zorba Coders is subscribed to branch lp:zorba/data-converters-module.
=== added file 'CMakeLists.txt'
--- CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,30 @@
+# Copyright 2006-2010 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+
+PROJECT (zorba_data-cleaning_module)
+ENABLE_TESTING ()
+INCLUDE (CTest)
+
+LIST (APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules")
+
+FIND_PACKAGE (Zorba REQUIRED HINTS "${ZORBA_BUILD_DIR}")
+INCLUDE ("${Zorba_USE_FILE}")
+
+ADD_TEST_DIRECTORY("${PROJECT_SOURCE_DIR}/test" "${EXCEPTION_LIST}")
+
+ADD_SUBDIRECTORY("src")
+
+DONE_DECLARING_ZORBA_URIS()
=== renamed file 'CMakeLists.txt' => 'CMakeLists.txt.moved'
=== added directory 'cmake_modules'
=== renamed directory 'cmake_modules' => 'cmake_modules.moved'
=== added file 'cmake_modules/CMakeCompareVersionStrings.cmake'
--- cmake_modules/CMakeCompareVersionStrings.cmake 1970-01-01 00:00:00 +0000
+++ cmake_modules/CMakeCompareVersionStrings.cmake 2011-12-22 13:29:42 +0000
@@ -0,0 +1,84 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Computes the relationship between two version strings. A version
+# string is a number delineated by '.'s such as 1.3.2 and 0.99.9.1.
+# You can feed version strings with different number of dot versions,
+# and the shorter version number will be padded with zeros: 9.2 <
+# 9.2.1 will actually compare 9.2.0 < 9.2.1.
+#
+# Input: a_in - value, not variable
+# b_in - value, not variable
+# result_out - variable with value:
+# -1 : a_in < b_in
+# 0 : a_in == b_in
+# 1 : a_in > b_in
+#
+# Written by James Bigler.
+MACRO(COMPARE_VERSION_STRINGS a_in b_in result_out)
+  # Since SEPARATE_ARGUMENTS uses ' ' as the separation token,
+ # replace '.' with ' ' to allow easy tokenization of the string.
+ STRING(REPLACE "." " " a "${a_in}")
+ STRING(REPLACE "." " " b "${b_in}")
+ SEPARATE_ARGUMENTS(a)
+ SEPARATE_ARGUMENTS(b)
+
+ # Check the size of each list to see if they are equal.
+ LIST(LENGTH a a_length)
+ LIST(LENGTH b b_length)
+
+ # Pad the shorter list with zeros.
+
+ # Note that range needs to be one less than the length as the for
+ # loop is inclusive (silly CMake).
+ IF(a_length LESS b_length)
+ # a is shorter
+ SET(shorter a)
+ MATH(EXPR range "${b_length} - 1")
+ MATH(EXPR pad_range "${b_length} - ${a_length} - 1")
+ ELSE(a_length LESS b_length)
+ # b is shorter
+ SET(shorter b)
+ MATH(EXPR range "${a_length} - 1")
+ MATH(EXPR pad_range "${a_length} - ${b_length} - 1")
+ ENDIF(a_length LESS b_length)
+
+ # PAD out if we need to
+ IF(NOT pad_range LESS 0)
+ FOREACH(pad RANGE ${pad_range})
+      # Since shorter is an alias for a or b, we need to get to it by dereferencing shorter.
+ LIST(APPEND ${shorter} 0)
+ ENDFOREACH(pad RANGE ${pad_range})
+ ENDIF(NOT pad_range LESS 0)
+
+ SET(result 0)
+ FOREACH(index RANGE ${range})
+ IF(result EQUAL 0)
+ # Only continue to compare things as long as they are equal
+ LIST(GET a ${index} a_version)
+ LIST(GET b ${index} b_version)
+ # LESS
+ IF(a_version LESS b_version)
+ SET(result -1)
+ ENDIF(a_version LESS b_version)
+ # GREATER
+ IF(a_version GREATER b_version)
+ SET(result 1)
+ ENDIF(a_version GREATER b_version)
+ ENDIF(result EQUAL 0)
+ ENDFOREACH(index)
+
+ # Copy out the return result
+ SET(${result_out} ${result})
+ENDMACRO(COMPARE_VERSION_STRINGS)
=== added directory 'src'
=== renamed directory 'src' => 'src.moved'
=== added file 'src/CMakeLists.txt'
--- src/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,20 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+MESSAGE(STATUS "Add com")
+ADD_SUBDIRECTORY(com)
+
+MESSAGE(STATUS "End modules")
=== added directory 'src/com'
=== added file 'src/com/CMakeLists.txt'
--- src/com/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(zorba-xquery)
=== added directory 'src/com/zorba-xquery'
=== added file 'src/com/zorba-xquery/CMakeLists.txt'
--- src/com/zorba-xquery/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(www)
=== added directory 'src/com/zorba-xquery/www'
=== added file 'src/com/zorba-xquery/www/CMakeLists.txt'
--- src/com/zorba-xquery/www/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(modules)
=== added directory 'src/com/zorba-xquery/www/modules'
=== added file 'src/com/zorba-xquery/www/modules/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,17 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# all external module libraries are generated in the directory
+# of the corresponding .xq file
+ADD_SUBDIRECTORY(data-cleaning)
=== added directory 'src/com/zorba-xquery/www/modules/data-cleaning'
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/CMakeLists.txt 2011-12-22 13:29:42 +0000
@@ -0,0 +1,40 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity"
+ VERSION 2.0 FILE "character-based-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/consolidation"
+ VERSION 2.0 FILE "consolidation.xq")
+
+DECLARE_ZORBA_SCHEMA( FILE whitepages_schema.xsd
+ URI "http://api.whitepages.com/schema/")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/conversion"
+ VERSION 2.0 FILE "conversion.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity"
+ VERSION 2.0 FILE "hybrid-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/normalization"
+ VERSION 2.0 FILE "normalization.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity"
+ VERSION 2.0 FILE "phonetic-string-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity"
+ VERSION 2.0 FILE "set-similarity.xq")
+
+DECLARE_ZORBA_MODULE (URI "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity"
+ VERSION 2.0 FILE "token-based-string-similarity.xq")
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/character-based-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,177 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides character-based string similarity functions
+ : that view strings as sequences of characters, generally computing a similarity score
+ : that corresponds to the cost of transforming one string into another.
+ :
+ : These functions are particularly useful for matching near duplicate strings
+ : in the presence of typographical errors.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the edit distance between two strings.
+ :
+ : This distance, also referred to as the Levenshtein distance, is defined as the minimum number
+ : of edits needed to transform one string into the other, with the allowable edit operations
+ : being insertion, deletion, or substitution of a single character.
+ :
+ : <br/>
+ : Example usage : <pre> edit-distance("FLWOR", "FLOWER") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 2 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return The edit distance between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq
+ :)
+declare function simc:edit-distance ( $s1 as xs:string, $s2 as xs:string ) as xs:integer {
+ if(string-length($s1) = 0) then string-length($s2) else
+ if(string-length($s2) = 0) then string-length($s1) else
+ min((
+ simc:edit-distance(substring($s1, 2), $s2) + 1 ,
+ simc:edit-distance($s1, substring($s2, 2)) + 1 ,
+ simc:edit-distance(substring($s1, 2), substring($s2, 2)) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then 0 else 1 )
+ ))
+};
+
+(:~
+ : Returns the Jaro similarity coefficient between two strings.
+ :
+ : This similarity coefficient is based on the number of transposed characters and on a
+ : weighted sum of the percentage of matched characters held within the strings. The higher
+ : the Jaro-Winkler value is, the more similar the strings are. The coefficient is
+ : normalized such that 0 equates to no similarity and 1 is an exact match.
+ :
+ : <br/>
+ : Example usage : <pre> jaro("FLWOR Found.", "FLWOR Foundation") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5853174603174603 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return The Jaro similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/jaro.xq
+ :)
+declare function simc:jaro ( $s1 as xs:string, $s2 as xs:string ) as xs:double {
+ let $s := for $i in ($s1,$s2) order by string-length($i) return $i
+ let $l1 := string-length($s[1])
+ let $l2 := string-length($s[2])
+ let $mt := xs:integer((max(($l1,$l2)) div 2.0) - 1)
+ let $mc := for $i in 1 to min( ($l1 , $l2) )
+ let $auxmatch := substring($s[2], max((1,$i - $mt)), $mt * 2 )
+ return for $j in 1 to string-length($auxmatch)
+ where substring($auxmatch, $j, 1) = substring($s[1], $i, 1)
+ return <match char="{substring($s[1], $i, 1)}" pos1="{$i}" pos2="{$j + max((1,$i - $mt)) - 1}" />
+ let $m := if (count($mc) = 0) then (1) else (count($mc))
+ let $t := count( for $i in $mc, $j in $mc where $i/@pos1>$j/@pos1 and $i/@pos2<$j/@pos2 return $i )
+ let $dist := xs:double((($m div $l1) + ($m div $l2) + (($m - $t) div $m)) div 3)
+ return $dist
+};
+
+(:~
+ : Returns the Jaro-Winkler similarity coefficient between two strings.
+ :
+ : This similarity coefficient corresponds to an extension of the Jaro similarity coefficient that weights or
+ : penalizes strings based on their similarity at the beginning of the string, up to a given prefix size.
+ :
+ : <br/>
+ : Example usage : <pre> jaro-winkler("DWAYNE", "DUANE", 4, 0.1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.8577777777777778 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $prefix The number of characters to consider when testing for equal prefixes in the strings.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes.
+ : @return The Jaro-Winkler similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq
+ :)
+declare function simc:jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double {
+ let $jaro := simc:jaro( $s1 , $s2 )
+ let $cc := for $i in 1 to min(($prefix, string-length($s1), string-length($s2)))
+ where substring($s1, 0, $i) = substring($s2, 0, $i) return $i
+ return ($jaro + ( $fact * max($cc) * ( 1 - $jaro ) ) )
+};
+
+(:~
+ : Returns the Needleman-Wunsch distance between two strings.
+ :
+ : The Needleman-Wunsch distance is similar to the basic edit distance metric, adding a
+ : variable cost adjustment to the cost of a gap (i.e., an insertion or deletion) in the
+ : distance metric.
+ :
+ : <br/>
+ : Example usage : <pre> needleman-wunsch("KAK", "KQRK", 1, 1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $score The score value.
+ : @param $penalty The penalty value.
+ : @return The Needleman-Wunsch distance between the two strings.
+ : @example test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq
+ :)
+declare function simc:needleman-wunsch ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
+
+ if(string-length($s1) = 0) then string-length($s2)* - $penalty else
+ if(string-length($s2) = 0) then string-length($s1)* - $penalty else
+ max((
+ simc:needleman-wunsch(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
+ simc:needleman-wunsch($s1, substring($s2, 2), $score, $penalty) - $penalty ,
+ simc:needleman-wunsch(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
+ ))
+};
+
+(:~
+ : Returns the Smith-Waterman distance between two strings.
+ :
+ : <br/>
+ : Example usage : <pre> smith-waterman("ACACACTA", "AGCACACA", 2, 1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 12 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $score The score value.
+ : @param $penalty The penalty value.
+ : @return The Smith-Waterman distance between the two strings.
+ :)
+declare function simc:smith-waterman ( $s1 as xs:string, $s2 as xs:string, $score as xs:integer, $penalty as xs:integer ) as xs:double{
+ if(string-length($s1) = 0) then 0 else
+ if(string-length($s2) = 0) then 0 else
+ max((
+ 0,
+ simc:smith-waterman(substring($s1, 2), $s2, $score, $penalty) - $penalty ,
+ simc:smith-waterman($s1, substring($s2, 2), $score, $penalty) - $penalty ,
+ simc:smith-waterman(substring($s1, 2), substring($s2, 2), $score, $penalty) + ( if(substring($s1, 1, 1) = substring($s2, 1, 1)) then $score else -$penalty )
+ ))
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/consolidation.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,579 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data consolidation functions that generally take as input a sequence of XML nodes
+ : and apply some rule in order to decide which node is better suited to represent the entire sequence.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the consolidation functions based on matching sequences against XPath expressions require
+ : some form of dynamic evaluation for XPath expressions,
+ : such as the x:eval() function provided in the Qizx XQuery Engine.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the single most frequent node in a sequence of nodes provided as input.
+ : If more than one answer is possible, returns the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-frequent( ( "a", "a", "b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The most frequent node in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-frequent.xq
+ :)
+declare function con:most-frequent ( $s ) {
+ (for $str in set:distinct($s) order by count($s[deep-equal(.,$str)]) descending return $str)[1]
+};
+
+(:~
+ : Returns the single less frequent node in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-frequent( ( "a", "a", "b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("b") </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The least frequent node in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/leastfrequent_1.xq
+ :)
+declare function con:least-frequent ( $s ) {
+ let $aux := for $str in set:distinct($s) order by count($s[deep-equal(.,$str)]) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single longest string, in terms of the number of characters, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> con:longest( ( "a", "aa", "aaa") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("aaa") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The longest string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/longest_1.xq
+ :)
+declare function con:longest ( $s as xs:string* ) as xs:string? {
+ let $aux := for $str in $s order by string-length($str) descending return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single shortest string, in terms of the number of characters, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> shortest( ( "a", "aa", "aaa") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The shortest string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/shortest_1.xq
+ :)
+declare function con:shortest( $s as xs:string* ) as xs:string? {
+ let $aux := for $str in $s order by string-length($str) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single longest string, in terms of the number of tokens, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-tokens( ( "a b c", "a b", "a"), " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a b c") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The longest string in the input sequence, in terms of the number of tokens.
+ : @example test/Queries/data-cleaning/consolidation/most-tokens.xq
+ :)
+declare function con:most-tokens ( $s as xs:string*, $r as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by count(tokenize($str,$r)) descending return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single shortest string, in terms of the number of tokens, in a sequence of strings provided as input.
+ : If more than one answer is possible, return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-tokens( ( "a b c", "a b", "a"), " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The shortest string in the input sequence, in terms of the number of tokens.
+ : @example test/Queries/data-cleaning/consolidation/least-tokens.xq
+ :)
+declare function con:least-tokens ( $s as xs:string*, $r as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by count(tokenize($str,$r)) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the strings from an input sequence of strings that match a particular regular expression.
+ :
+ : <br/>
+ : Example usage : <pre> matching( ( "a A b", "c AAA d", "e BB f"), "A+" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "a A b", "c AAA d") </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $r The regular expression to be used in the matching.
+ : @return The strings in the input sequence that match the input regular expression.
+ : @example test/Queries/data-cleaning/consolidation/matching_1.xq
+ :)
+declare function con:matching ( $s as xs:string*, $r as xs:string ) as xs:string* {
+ for $str in $s where matches($str,$r) return $str
+};
+
+(:~
+ : Returns the single string, from an input sequence of strings, that appears more frequently as part
+ : of the other strings in the sequence. If no such string exists, the function returns an empty sequence.
+ : If more than one answer is possible, the function returns the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> super-string( ( "aaa bbb ccc", "aaa bbb", "aaa ddd", "eee fff" ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "aaa bbb" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @return The string that appears more frequently as part of the other strings in the sequence.
+ : @example test/Queries/data-cleaning/consolidation/superstring_1.xq
+ :)
+declare function con:superstring ( $s as xs:string* ) as xs:string? {
+ let $aux :=
+ for $str in $s
+ let $cnt := count ( for $str2 in $s return if(contains($str2,$str)) then $str else () )
+ where $cnt > 1
+ order by $cnt descending
+ return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single most similar string, in terms of the edit distance metric towards an input string,
+ : in a sequence of strings provided as input. If more than one string has a maximum similarity (a minimum
+ : value for the edit distance metric), the function return the first string according to the order of the
+ : input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "aaabbb" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $m The string towards which we want to measure the edit distance.
+ : @return The most similar string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq
+ :)
+declare function con:most-similar-edit-distance ( $s as xs:string*, $m as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by simc:edit-distance($str,$m) return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single least similar string, in terms of the edit distance metric towards an input string,
+ : in a sequence of strings provided as input. If more than one string has a minimum similarity (a maximum
+ : value for the edit distance metric), return the first string according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( "eeefff" ) </pre>
+ :
+ : @param $s A sequence of strings.
+ : @param $m The string towards which we want to measure the edit distance.
+ : @return The least similar string in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq
+ :)
+declare function con:least-similar-edit-distance ( $s as xs:string*, $m as xs:string ) as xs:string? {
+ let $aux := for $str in $s order by simc:edit-distance($str,$m) descending return $str
+ return if (count($aux) = 0) then () else ($aux[1])
+};
+
+(:~
+ : Returns the single node having the largest number of descending elements (sub-elements at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-elements( ( <a><b/></a>, <a/>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-elements.xq
+ :)
+declare function con:most-elements ( $s ) {
+ (: Sort the distinct input nodes by their element count (self plus all
+    descendants), largest first; the stable sort keeps ties in sequence order. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::element()) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending attributes (attributes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a att1="a1" att2="a2"/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-attributes.xq
+ :)
+declare function con:most-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of attributes found on the
+    node itself or any descendant, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::*/attribute()) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of descending nodes (sub-nodes at any given depth) in a
+ : sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-nodes( ( <a><b/></a>, <a/>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-nodes.xq
+ :)
+declare function con:most-nodes ( $s ) {
+ (: Sort the distinct input nodes by their total node count (self plus all
+    descendant nodes of any kind), largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::node()) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending elements (sub-elements at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-elements( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-elements.xq
+ :)
+declare function con:least-elements ( $s ) {
+ (: Sort the distinct input nodes by their element count (self plus all
+    descendants), smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::element())
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending attributes (attributes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<c/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-attributes.xq
+ :)
+declare function con:least-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of attributes found on the
+    node itself or any descendant, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::*/attribute())
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of descending nodes (sub-nodes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-nodes.xq
+ :)
+declare function con:least-nodes ( $s ) {
+ (: Sort the distinct input nodes by their total node count (self plus all
+    descendant nodes of any kind), smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count($node/descendant-or-self::node())
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending elements (sub-elements at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-elements( ( <a><b/><c/><d/></a>, <a><b/><b/><c/></a>, <a/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/><c/><d/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-elements.xq
+ :)
+declare function con:most-distinct-elements ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* elements among
+    self and all descendants, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::element())) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending attributes (attributes at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-attributes( ( <a att1="a1" att2="a2" att3="a3"/>, <a att1="a1" att2="a2"><b att2="a2" /></a>, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a att1="a1" att2="a2" att3="a3"/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq
+ :)
+declare function con:most-distinct-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* attributes found
+    on the node itself or any descendant, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::*/attribute())) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the largest number of distinct descending nodes (sub-nodes at any given depth) in
+ : a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-distinct-nodes( ( <a><b/></a>, <a><a/></a>, <b/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the largest number of distinct descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq
+ :)
+declare function con:most-distinct-nodes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* nodes among
+    self and all descendants, largest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::node())) descending
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending elements (sub-elements at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-elements( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending elements in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-elements.xq
+ :)
+declare function con:least-distinct-elements ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* elements among
+    self and all descendants, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::element()))
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending attributes (attributes at any
+ : given depth) in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<c/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending attributes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq
+ :)
+declare function con:least-distinct-attributes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* attributes found
+    on the node itself or any descendant, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::*/attribute()))
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the single node having the smallest number of distinct descending nodes (sub-nodes at any given depth)
+ : in a sequence of nodes provided as input.
+ : If more than one answer is possible, return the first node according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-distinct-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<d/>) </pre>
+ :
+ : @param $s A sequence of nodes.
+ : @return The node having the smallest number of distinct descending nodes in the input sequence.
+ : @example test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq
+ :)
+declare function con:least-distinct-nodes ( $s ) {
+ (: Sort the distinct input nodes by the number of *distinct* nodes among
+    self and all descendants, smallest first; take the head. :)
+ let $ranked :=
+   for $node in set:distinct($s)
+   order by count(set:distinct($node/descendant-or-self::node()))
+   return $node
+ return $ranked[1]
+};
+
+(:~
+ : Returns the elements from an input sequence of elements that, when matched to a given set of XPath expressions,
+ : produce a non-empty set of nodes in all the cases.
+ :
+ : <br/>
+ : Example usage : <pre> all-xpaths( ( <a><b/></a>, <c><d/></c>, <d/>), (".//b") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> (<a><b/></a>) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The elements that, when matched to the given set of XPath expressions, always return a non-empty set of nodes.
+ :)
+declare function con:all-xpaths ( $s as element()* , $paths as xs:string* ) {
+(: NOTE(review): the eval-based implementation below is intentionally
+   commented out (workaround for bug lp:871051); the function currently
+   returns the empty string instead of the documented element sequence.
+   Confirm whether callers/tests expect this stub behavior. :)
+(:
+ for $str in set:distinct($s)
+ where every $path in $paths satisfies count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0
+ return $str
+ :)
+ ""
+};
+
+(:~
+ : Returns the elements from a sequence of elements that, when matched to a given set of XPath expressions,
+ : produce a non-empty set of nodes for some of the cases.
+ :
+ : <br/>
+ : Example usage : <pre> some-xpaths( ( <a><b/></a>, <d><c/></d>, <d/>), (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <a><b/></a> , <d><c/></d> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The elements that, when matched to the given set of XPath expressions, return a non-empty set of nodes
+ : for at least one of the cases.
+ :)
+declare function con:some-xpaths ( $s as element()* , $paths as xs:string* ) {
+(: NOTE(review): the eval-based implementation below is intentionally
+   commented out (workaround for bug lp:871051); the function currently
+   returns the empty string instead of the documented element sequence.
+   Confirm whether callers/tests expect this stub behavior. :)
+(:
+ for $str in set:distinct($s)
+ where some $path in $paths satisfies count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0
+ return $str
+ :)
+ ""
+};
+
+(:~
+ : Returns the single element from an input sequence of elements that matches the largest number of
+ : XPath expressions from a given set, producing a non-empty set of nodes.
+ : If more than one answer is possible, return the first element according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> most-xpaths( ( <a><b/></a>, <d><c/><b/></d>, <d/>) , (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <d><c/><b/></d> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The element that matches the largest number of XPath expressions producing a non-empty set of nodes.
+ :)
+
+declare function con:most-xpaths ( $s as element()* , $paths as xs:string* ) {
+ (: NOTE(review): the eval-based implementation below is intentionally
+    commented out (workaround for bug lp:871051); the function currently
+    returns the empty string instead of the documented element.
+    Confirm whether callers/tests expect this stub behavior. :)
+ (:
+ (
+ for $str in set:distinct($s)
+ let $cnt := sum( for $path in $paths where count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0 return 1 )
+ order by $cnt descending
+ return $str
+ )[1]
+ :)
+ ""
+};
+
+(:~
+ : Returns the single element from an input sequence of elements that matches the smallest number of
+ : XPath expressions from a given set, producing a non-empty set of nodes.
+ : If more than one answer is possible, return the first element according to the order of the input sequence.
+ :
+ : <br/>
+ : Example usage : <pre> least-xpaths( ( <a><b/></a>, <d><c/><b/></d>, <d/>) , (".//b", ".//c") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <d/> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $paths A sequence of strings denoting XPath expressions.
+ : @return The element that matches the smallest number of XPath expressions producing a non-empty set of nodes.
+ :)
+
+declare function con:least-xpaths ( $s as element()* , $paths as xs:string* ) {
+(: NOTE(review): the eval-based implementation below is intentionally
+   commented out (workaround for bug lp:871051); the function currently
+   returns the empty string instead of the documented element.
+   Confirm whether callers/tests expect this stub behavior. :)
+(:
+ (
+ for $str in set:distinct($s)
+ let $cnt := sum( for $path in $paths where count(
+ x:eval( concat( "<xml>",
+ x:serialize ( $str , <options omit-xml-declaration="true" /> ),
+ if(starts-with($path,"/")) then ("</xml>") else ("</xml>/") ,
+ $path) ) ) > 0 return 1 )
+ order by $cnt
+ return $str
+ )[1]
+ :)
+ ""
+};
+
+(:~
+ : Returns the nodes from an input sequence of nodes that validate against a given XML Schema.
+ :
+ : <br/>
+ : Example usage : <pre> validating-schema ( ( <a/> , <b/> ), <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"><xs:element name="a" /></xs:schema> ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ( <a/> ) </pre>
+ :
+ : @param $s A sequence of elements.
+ : @param $schema An element encoding an XML Schema.
+ : @return The nodes that validate against the XML Schema.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function con:validating-schema ( $s as element()*, $schema as element() ) {
+ (: Not implemented (also stated in the xqdoc above): unconditionally returns
+    false() rather than the documented sequence of schema-valid nodes. :)
+ false()
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/conversion.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,407 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data conversion functions for processing calendar dates,
+ : temporal values, currency values, units of measurement, location names and postal addresses.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+declare namespace exref = "http://www.ecb.int/vocabulary/2002-08-01/eurofxref";
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+
+import schema namespace wp = 'http://api.whitepages.com/schema/';
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+
+import module namespace reflection = "http://www.zorba-xquery.com/modules/reflection";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~ The key to be used when accessing the White Pages Web service :)
+declare variable $conversion:key := "06ea2f21cc15602b6a3e242e3225a81a";
+
+(:~
+ : Uses a White-pages Web service to discover information about a given name,
+ : returning a sequence of strings for the phone numbers associated to the name.
+ :
+ :
+ : @param $name The name of person or organization.
+ : @return A sequence of strings for the phone numbers associated to the name.
+ : @example test/Queries/data-cleaning/conversion/phone-from-user.xq
+ :)
+declare %ann:nondeterministic function conversion:phone-from-user ( $name as xs:string) as xs:string*{
+ (: Build the find_person request (spaces URL-encoded as %20) and extract
+    every full phone number from the returned listings. :)
+ let $encoded-name := replace($name, " ", "%20")
+ let $request := concat("http://api.whitepages.com/find_person/1.0/?name=", $encoded-name, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ return $response/wp:wp/wp:listings/wp:listing/wp:phonenumbers/wp:phone/wp:fullphone/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given name,
+ : returning a sequence of strings for the addresses associated to the name.
+ :
+ :
+ : @param $name The name of person or organization.
+ : @return A sequence of strings for the addresses associated to the name.
+ : @example test/Queries/data-cleaning/conversion/address-from-user.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-user ( $name as xs:string) as xs:string*{
+ (: Query the find_person service and render each listed address as
+    "fullstreet, city, state, country". :)
+ let $encoded-name := replace($name, " ", "%20")
+ let $request := concat("http://api.whitepages.com/find_person/1.0/?name=", $encoded-name, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ for $addr in $response/wp:wp/wp:listings/wp:listing/wp:address
+ return concat($addr/wp:fullstreet/text(), ", ", $addr/wp:city/text(), ", ",
+               $addr/wp:state/text(), ", ", $addr/wp:country/text())
+};
+
+
+(:~
+ : Uses a White-pages Web service to discover information about a given phone number,
+ : returning a sequence of strings for the name associated to the phone number.
+ :
+ :
+ : @param $phone-number A string with 10 digits corresponding to the phone number.
+ : @return A sequence of strings for the person or organization's name associated to the phone number.
+ : @example test/Queries/data-cleaning/conversion/user-from-phone.xq
+ :)
+declare %ann:nondeterministic function conversion:user-from-phone ( $phone-number as xs:string) as xs:string*{
+ (: Reverse-phone lookup: return the display name of every matching listing. :)
+ let $request := concat("http://api.whitepages.com/reverse_phone/1.0/?phone=", $phone-number, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ return $response/wp:wp/wp:listings/wp:listing/wp:displayname/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given phone number,
+ : returning a string for the address associated to the phone number.
+ :
+ :
+ : @param $phone-number A string with 10 digits corresponding to the phone number.
+ : @return A string for the addresses associated to the phone number.
+ : @example test/Queries/data-cleaning/conversion/address-from-phone.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-phone ( $phone-number as xs:string) as xs:string*{
+ (: Reverse-phone lookup: format each listed address as
+    "fullstreet, city, state, country" and drop duplicates. :)
+ let $request := concat("http://api.whitepages.com/reverse_phone/1.0/?phone=", $phone-number, ";api_key=", $conversion:key)
+ let $response := http:get-node($request)[2]
+ return distinct-values(
+   for $addr in $response/wp:wp/wp:listings/wp:listing/wp:address
+   return concat($addr/wp:fullstreet/text(), ", ", $addr/wp:city/text(), ", ",
+                 $addr/wp:state/text(), ", ", $addr/wp:country/text()) )
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given address,
+ : returning a sequence of strings for the names associated to the address.
+ :
+ :
+ : @param $address A string corresponding to the address (ex: 5655 E Gaskill Rd, Willcox, AZ, US).
+ : @return A sequence of strings for the person or organization's names associated to the address.
+ : @example test/Queries/data-cleaning/conversion/user-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:user-from-address ( $address as xs:string) as xs:string*{
+ (: Split the comma-separated address into tokens, pick the state token by
+    token count (4- or 5-part addresses), derive house number and street from
+    the first token, then ask the reverse_address service for display names. :)
+ let $tokens := tokenize($address, ",")
+ let $full-street := $tokens[1]
+ let $num-tokens := count($tokens)
+ let $state :=
+   if ($num-tokens = 4) then replace($tokens[3], " ", "")
+   else if ($num-tokens = 5) then replace($tokens[4], " ", "")
+   else ()
+ let $house := tokenize($full-street, " ")[1]
+ let $street := replace(replace($full-street, "[0-9]+[ ]", ""), " ", "%20")
+ let $request := concat("http://api.whitepages.com/reverse_address/1.0/?house=",$house, ";street=",$street, ";state=",$state,";api_key=",$conversion:key)
+ let $response := http:get-node($request)[2]
+ return $response/wp:wp/wp:listings/wp:listing/wp:displayname/text()
+};
+
+(:~
+ : Uses a White-pages Web service to discover information about a given address,
+ : returning a sequence of strings for the phone number associated to the address.
+ :
+ :
+ : @param $address A string corresponding to the address (ex: 5655 E Gaskill Rd, Willcox, AZ, US).
+ : @return A sequence of strings for the phone numbers associated to the address.
+ : @example test/Queries/data-cleaning/conversion/phone-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:phone-from-address ( $address as xs:string) as xs:string*{
+ (: Split the comma-separated address; the state token position depends on
+    whether the address has 4 or 5 comma-separated parts. :)
+ let $tokens := tokenize ($address, ",")
+ let $token-full-street := $tokens[position()=1]
+ let $state :=
+ if (count($tokens) = 4)
+ then replace($tokens[position()=3], " ", "")
+ else
+ if (count($tokens) = 5)
+ then replace($tokens[position()=4], " ", "")
+ else()
+ (: $house: strip letters, ordinal-suffix tokens (e.g. "1st"-style) and spaces
+    from the first token, leaving the bare house number — presumably; the
+    regex also removes digits followed by two letters. TODO confirm. :)
+ let $house := replace($token-full-street, "([A-Za-z]+|[0-9]+[A-Za-z][A-Za-z]|[ ]+)", "")
+ let $street-w-space := replace($token-full-street, $house, "")
+ (: NOTE(review): when $street-w-space has neither a leading nor a trailing
+    space, this falls through to () and $street-form becomes empty — verify
+    this is the intended behavior for single-word streets. :)
+ let $street :=
+ if (substring($street-w-space, 1, 1) = " ")
+ then substring($street-w-space, 2)
+ else
+ if(substring($street-w-space, string-length($street-w-space), 1) = " ")
+ then substring($street-w-space, 1, string-length($street-w-space)-1)
+ else ()
+ let $street-form := replace($street, " ", "%20")
+ let $url := concat("http://api.whitepages.com/reverse_address/1.0/?house=",$house, ";street=",$street-form, ";state=",$state,";api_key=",$conversion:key)
+ let $doc := http:get-node($url)[2]
+ return $doc/wp:wp/wp:listings/wp:listing/wp:phonenumbers/wp:phone/wp:fullphone/text()
+};
+
+(:~
+ : Conversion function for units of measurement, based on a built-in table of pairwise conversion rules.
+ : <br/>
+ :
+ :
+ : @param $v The amount we wish to convert.
+ : @param $t The type of metric (e.g., "Distance")
+ : @param $m1 The source measurement unit metric (e.g., "meter")
+ : @param $m2 The target measurement unit metric (e.g., "mile")
+ : @return The value resulting from the conversion
+ : @example test/Queries/data-cleaning/conversion/unit-convert.xq
+ :)
+declare %ann:nondeterministic function conversion:unit-convert ( $v as xs:double, $t as xs:string, $m1 as xs:string, $m2 as xs:string ) {
+ (: Identity conversion needs no table lookup. :)
+ if ( $m1 = $m2 ) then $v else
+
+(: Static table of pairwise conversion rules. For every metric except
+   "Temperature" the @value attribute is a multiplicative factor; for
+   "Temperature" it is an XQuery expression fragment that is appended to
+   $v and evaluated dynamically below (temperature conversions are affine,
+   not purely multiplicative). :)
+let $conversion-table :=
+ <unit-conversion-rules>
+ <unit type="Distance" from="mile" to="kilometer" value="1.609344" />
+ <unit type="Distance" from="mile" to="angstrom" value="16100000000000" />
+ <unit type="Distance" from="mile" to="picometer" value="1610000000000000" />
+ <unit type="Distance" from="mile" to="nanometer" value="1610000000000" />
+ (: fixed: was "microometer", a dead row inconsistent with the
+    "micrometer" spelling used by every other rule in this table :)
+ <unit type="Distance" from="mile" to="micrometer" value="1610000000" />
+ <unit type="Distance" from="mile" to="millimeter" value="1610000" />
+ <unit type="Distance" from="mile" to="centimeter" value="161000" />
+ <unit type="Distance" from="mile" to="meter" value="1610" />
+ <unit type="Distance" from="mile" to="inch" value="63400" />
+ <unit type="Distance" from="mile" to="feet" value="5280" />
+ <unit type="Distance" from="kilometer" to="meter" value="1000" />
+ <unit type="Distance" from="kilometer" to="picometer" value="1000000000000000" />
+ <unit type="Distance" from="kilometer" to="angstrom" value="10000000000000" />
+ <unit type="Distance" from="kilometer" to="nanometer" value="1000000000000" />
+ <unit type="Distance" from="kilometer" to="micrometer" value="1000000000" />
+ <unit type="Distance" from="kilometer" to="millimeter" value="1000000" />
+ <unit type="Distance" from="kilometer" to="centimeter" value="100000" />
+ <unit type="Distance" from="kilometer" to="inch" value="39400" />
+ <unit type="Distance" from="kilometer" to="feet" value="3280" />
+ <unit type="Distance" from="meter" to="centimeter" value="100" />
+ <unit type="Distance" from="meter" to="picometer" value="1000000000000" />
+ <unit type="Distance" from="meter" to="angstrom" value="10000000000" />
+ <unit type="Distance" from="meter" to="nanometer" value="1000000000" />
+ <unit type="Distance" from="meter" to="micrometer" value="1000000" />
+ <unit type="Distance" from="meter" to="millimeter" value="1000" />
+ <unit type="Distance" from="meter" to="inch" value="39.4" />
+ <unit type="Distance" from="meter" to="feet" value="3.28" />
+ <unit type="Distance" from="centimeter" to="millimeter" value="10" />
+ <unit type="Distance" from="millimeter" to="micrometer" value="1000" />
+ <unit type="Distance" from="micrometer" to="nanometer" value="1000" />
+ <unit type="Distance" from="nanometer" to="angstrom" value="10" />
+ <unit type="Distance" from="angstrom" to="picometer" value="100" />
+ <unit type="Distance" from="inch" to="feet" value="0.0833" />
+ <unit type="Mass" from="tons" to="kilograms" value="907.18474" />
+ <unit type="Mass" from="tons" to="pounds" value="2000" />
+ <unit type="Mass" from="tons" to="ounces" value="32000" />
+ <unit type="Mass" from="tons" to="grams" value="907184.74" />
+ <unit type="Mass" from="tons" to="milligrams" value="907184740" />
+ <unit type="Mass" from="kilograms" to="pounds" value="2.2046226" />
+ <unit type="Mass" from="kilograms" to="grams" value="1000" />
+ <unit type="Mass" from="kilograms" to="milligrams" value="1000000" />
+ <unit type="Mass" from="grams" to="milligrams" value="1000" />
+ <unit type="Mass" from="pounds" to="ounces" value="16" />
+ <unit type="Mass" from="pounds" to="grams" value="453.59237" />
+ <unit type="Mass" from="pounds" to="milligrams" value="453592.37" />
+ <unit type="Mass" from="ounces" to="kilograms" value="0.028349523" />
+ <unit type="Mass" from="ounces" to="grams" value="28.349523" />
+ <unit type="Mass" from="ounces" to="milligrams" value="28349.523" />
+ <unit type="Volume" from="liters" to="cubic centimeters" value="1000" />
+ <unit type="Energy" from="jouls" to="calories" value="0.239" />
+ <unit type="Pressure" from="pascals" to="kilopascals" value="0.001" />
+ <unit type="Pressure" from="pascals" to="bars" value="0.000001" />
+ <unit type="Pressure" from="pascals" to="mmHg" value="0.00750064" />
+ <unit type="Pressure" from="pascals" to="torrs" value="0.00750064" />
+ <unit type="Pressure" from="atmospheres" to="pascals" value="101325" />
+ <unit type="Pressure" from="atmospheres" to="kilopascals" value="101.325" />
+ <unit type="Pressure" from="atmospheres" to="bars" value="1.01325" />
+ <unit type="Pressure" from="atmospheres" to="mmHg" value="760" />
+ <unit type="Pressure" from="atmospheres" to="torrs" value="760" />
+ <unit type="Pressure" from="atmospheres" to="psi" value="14.7" />
+ <unit type="Pressure" from="psi" to="pascals" value="6890" />
+ <unit type="Pressure" from="psi" to="kilopascals" value="6.89" />
+ <unit type="Pressure" from="psi" to="bars" value="0.0689" />
+ <unit type="Pressure" from="psi" to="mmHg" value="51.7" />
+ <unit type="Pressure" from="psi" to="torrs" value="51.7" />
+ <unit type="Pressure" from="bars" to="kilopascals" value="100" />
+ <unit type="Pressure" from="bars" to="mmHg" value="750.064" />
+ <unit type="Pressure" from="bars" to="torrs" value="750.064" />
+ <unit type="Pressure" from="kilopascals" to="mmHg" value="7.50064" />
+ <unit type="Pressure" from="kilopascals" to="torrs" value="7.50064" />
+ <unit type="Pressure" from="mmHg" to="torrs" value="1" />
+ <unit type="Temperature" from="celsius" to="fahrenheit" value="* 9 div 5 + 32" />
+ <unit type="Temperature" from="celsius" to="kelvin" value="+ 273.15" />
+ <unit type="Temperature" from="kelvin" to="celsius" value="- 273.15" />
+ <unit type="Temperature" from="kelvin" to="fahrenheit" value="* 9 div 5 - 273.15 * 9 div 5 + 32" />
+ <unit type="Temperature" from="fahrenheit" to="celsius" value="* 5 div 9 - 32 * 5 div 9" />
+ <unit type="Temperature" from="fahrenheit" to="kelvin" value="* 5 div 9 - 32 * 5 div 9 + 273.15" />
+</unit-conversion-rules>
+
+(: $from collects every rule whose source unit is $m1: the direct rows, plus
+   (for non-temperature metrics only) inverted copies of the rows pointing
+   *to* $m1 — built with a copy/modify transform that takes the reciprocal
+   factor and swaps @from/@to. Temperature fragments cannot be inverted
+   numerically, so they are excluded here. :)
+let $from := $conversion-table/unit[@type=$t and @from=$m1] |
+ ( for $it in $conversion-table/unit[@type=$t and @to=$m1] return
+ if (compare($t, "Temperature") != 0) then
+ copy $aux := $it
+ modify (
+ replace value of node $aux/@value with 1.0 div $aux/@value,
+ replace value of node $aux/@from with $aux/@to,
+ replace value of node $aux/@to with $aux/@from
+ )
+ return $aux
+ else()
+ )
+
+return
+(: Temperature: evaluate "$v <fragment>" dynamically (affine formula). :)
+if (compare($t, "Temperature") = 0) then reflection:eval(concat($v , $conversion-table//unit[@from=$m1][@to=$m2]/@value))
+else
+ (: Use a direct rule when one exists; otherwise hop through the first
+    reachable intermediate unit and recurse toward $m2. :)
+ if ( $from[@to=$m2]) then ( $v * $from[@to=$m2]/@value )
+ else ( for $i in $from return conversion:unit-convert ( $v * $i/@value , $t , $i/@to , $m2 ) )[1]
+};
+
+(:~
+ : Placename to geospatial coordinates converter, acting as a wrapper over the Yahoo! geocoder service.
+ :
+ :
+ : @param $q A sequence of strings corresponding to the different components (e.g., street, city, country, etc.) of the place name.
+ : @return The pair of latitude and longitude coordinates associated with the input address.
+ : @example test/Queries/data-cleaning/conversion/geocode-from-address.xq
+ :)
+declare %ann:nondeterministic function conversion:geocode-from-address ( $q as xs:string* ) as xs:double* {
+ (: Join the address components (spaces become '+') into one comma-separated
+    query, call the Yahoo! geocoder, and return (latitude, longitude). :)
+ let $app-id := ""
+ let $query := string-join(for $part in $q return translate($part," ","+"),",")
+ let $call := concat("http://where.yahooapis.com/geocode?q=",$query,"&appid=",$app-id)
+ let $result := http:get-node($call)[2]/ResultSet/Result
+ return ( xs:double($result/latitude/text()) , xs:double($result/longitude/text()) )
+};
+
+(:~
+ : Geospatial coordinates to placename converter, acting as a wrapper over the Yahoo! reverse geocoder service.
+ :
+ :
+ : @param $lat Geospatial latitude.
+ : @param $lon Geospatial longitude.
+ : @return The sequence of strings corresponding to the different components (e.g., street, city, country, etc.) of the place name that corresponds to the input geospatial coordinates.
+ : @example test/Queries/data-cleaning/conversion/address-from-geocode.xq
+ :)
+declare %ann:nondeterministic function conversion:address-from-geocode ( $lat as xs:double, $lon as xs:double ) as xs:string*  {
+ (: Reverse-geocode via the Yahoo! service (gflags=R) and return the non-empty
+    place-name components from coarsest (country) to finest (house).
+    NOTE(review): $id (the appid) is empty here — presumably the service
+    tolerates it or a key is expected to be filled in; confirm. :)
+ let $id := ""
+ let $url := "http://where.yahooapis.com/geocode?q="
+ let $q := concat($lat,",+",$lon)
+ let $call := concat($url,$q,"&gflags=R&appid=",$id)
+ let $doc := http:get-node($call)[2]
+ (: Each //xs:string(*:tag) step atomizes the matching element (any namespace)
+    to a string; empty components are filtered out before deduplication. :)
+ return distinct-values( (if (string-length($doc//xs:string(*:country)) > 0) then ($doc//xs:string(*:country)) else (),
+ if (string-length($doc//xs:string(*:state)) > 0) then ($doc//xs:string(*:state)) else (),
+ if (string-length($doc//xs:string(*:county)) > 0) then ($doc//xs:string(*:county)) else (),
+ if (string-length($doc//xs:string(*:city)) > 0) then ($doc//xs:string(*:city)) else (),
+ if (string-length($doc//xs:string(*:neighborhood)) > 0) then ($doc//xs:string(*:neighborhood)) else (),
+ if (string-length($doc//xs:string(*:street)) > 0) then ($doc//xs:string(*:street)) else (),
+ if (string-length($doc//xs:string(*:house)) > 0) then ($doc//xs:string(*:house)) else () ) )
+};
+
+(:~
+ : Currency conversion function, acting as a wrapper over the WebService from the European Central Bank.
+ :
+ : WebService documentation at http://www.ecb.int/stats/exchange/eurofxref/html/index.en.html
+ :
+ :
+ : @param $v The amount we wish to convert.
+ : @param $m1 The source currency (e.g., "EUR").
+ : @param $m2 The target currency (e.g., "USD").
+ : @param $date The reference date.
+ : @return The value resulting from the conversion.
+ : @error conversion:notsupported if the date, the source currency type or the target currency type are not known to the service.
+ : @see http://www.ecb.int/stats/exchange/eurofxref/html/index.en.html
+ : @example test/Queries/data-cleaning/conversion/currency-convert.xq
+ :)
+declare %ann:nondeterministic function conversion:currency-convert ( $v as xs:double, $m1 as xs:string, $m2 as xs:string, $date as xs:string ) {
+ (: With no reference date, use the ECB daily feed; otherwise take the most
+  : recent historical rate table dated on or before $date. :)
+ let $rates :=
+   if (string-length($date) = 0)
+   then http:get-node("http://www.ecb.europa.eu/stats/eurofxref/eurofxref-daily.xml")[2]
+   else (for $cube in http:get-node("http://www.ecb.europa.eu/stats/eurofxref/eurofxref-hist.xml")[2]//exref:Cube[xs:string(@time)<=$date]
+         order by $cube/xs:string(@time) descending
+         return $cube)[1]
+ (: ECB quotes every rate against the Euro, so convert source -> EUR -> target. :)
+ let $sourceRate := if ( $m1="EUR" ) then (xs:double(1.0)) else ( $rates//exref:Cube[xs:string(@currency)=$m1]/xs:double(@rate) )
+ let $targetRate := if ( $m2="EUR" ) then (xs:double(1.0)) else ( $rates//exref:Cube[xs:string(@currency)=$m2]/xs:double(@rate) )
+ let $converted := ($v div $sourceRate) * $targetRate
+ (: An unknown currency or date yields NaN/empty; the regex check rejects it with conversion:notsupported. :)
+ return
+   if (matches(string($converted),"-?[0-9]+(\.[0-9]+)?"))
+   then ($converted)
+   else (error(QName('http://www.zorba-xquery.com/modules/data-cleaning/conversion', 'conversion:notsupported'), data($converted)))
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the phone numbers associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the phone numbers associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:phone-from-domain ( $domain as xs:string ) {
+ (: Not implemented yet: always returns the empty sequence. :)
+ ()
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the addresses associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the addresses associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:address-from-domain ( $domain as xs:string ) {
+ (: Not implemented yet: always returns the empty sequence. :)
+ ()
+};
+
+(:~
+ : Uses a whois service to discover information about a given domain name, returning a sequence of strings
+ : for the person or organization names associated to the name.
+ :
+ : @param $domain A string with the domain.
+ : @return A sequence of strings for the person or organization names associated to the domain name.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+declare function conversion:name-from-domain ( $domain as xs:string ) {
+ (: Not implemented yet: always returns the empty sequence. :)
+ ()
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/hybrid-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,223 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides hybrid string similarity functions, combining the properties of
+ : character-based string similarity functions and token-based string similarity functions.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the module requires the trigonometric functions of XQuery 1.1 or a math extension
+ : function such as sqrt($x as numeric) for computing the square root.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+(: In the QizX or Saxon XQuery engines, it is possible to call external functions from the Java math library :)
+(: declare namespace math = "java:java.lang.Math"; :)
+declare namespace math = "http://www.w3.org/2005/xpath-functions/math";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : The tokens from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Soundex phonetic similarity function is used to discover token identity, which is equivalent to saying that
+ : this function returns the cosine similarity coefficient between sets of Soundex keys.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-soundex("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of Soundex keys extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq
+ :)
+declare function simh:soft-cosine-tokens-soundex ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ (: Map every token of each input string to its Soundex key, then score the
+  : cosine similarity of the two key sequences. :)
+ simt:cosine( for $token in tokenize($s1,$r) return simp:soundex-key($token),
+              for $token in tokenize($s2,$r) return simp:soundex-key($token) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Metaphone phonetic similarity function is used to discover token identity, which is equivalent to saying that
+ : this function returns the cosine similarity coefficient between sets of Metaphone keys.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-metaphone("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of Metaphone keys extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq
+ :)
+declare function simh:soft-cosine-tokens-metaphone ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ (: Replace each token by its Metaphone key and measure the cosine similarity
+  : of the resulting key sequences. :)
+ simt:cosine( for $token in tokenize($s1,$r) return simp:metaphone-key($token),
+              for $token in tokenize($s2,$r) return simp:metaphone-key($token) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Edit Distance similarity function is used to discover token identity, and tokens having an edit distance
+ : bellow a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-edit-distance("The FLWOR Foundation", "FLWOR Found.", " +", 0 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.408248290463863 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ :)
+declare function simh:soft-cosine-tokens-edit-distance ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:integer ) as xs:double {
+(:
+ let $tokens1 := tokenize($s1,$r)
+ let $tokens2 := tokenize($s2,$r)
+ let $tokens := ($tokens1, $tokens2)
+ let $vocab := for $a at $apos in $tokens
+               where every $ba in subsequence($tokens, 1, $apos - 1) satisfies not(simc:edit-distance($ba,$a) <= $t)
+               return $a
+ let $freq1 := for $a1 in $vocab return count($tokens1[simc:edit-distance(.,$a1) <= $t])
+ let $freq2 := for $a2 in $vocab return count($tokens2[simc:edit-distance(.,$a2) <= $t])
+ let $freq1pow := for $aa1 in $freq1 return $aa1 * $aa1
+ let $freq2pow := for $aa2 in $freq2 return $aa2 * $aa2
+ let $mult := for $freq at $pos in $freq1 return $freq * $freq2[$pos]
+ return sum($mult) div (math:sqrt(sum($freq1pow)) * math:sqrt(sum($freq2pow)))
+ :)
+ (: NOTE(review): the real implementation above is commented out, so this
+  : function currently always returns 0 — presumably disabled because it needs
+  : math:sqrt (see module header note and bug 871051). Confirm this is
+  : intentional before relying on the score. :)
+ xs:double(0)
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Jaro similarity function is used to discover token identity, and tokens having a Jaro similarity above
+ : a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-jaro("The FLWOR Foundation", "FLWOR Found.", " +", 1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq
+ :)
+declare function simh:soft-cosine-tokens-jaro ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:double ) as xs:double {
+ let $left := tokenize($s1,$r)
+ let $right := tokenize($s2,$r)
+ let $all := ($left, $right)
+ (: Build the vocabulary: keep a token only if no earlier token is Jaro-similar
+  : to it at threshold $t, i.e. one representative per fuzzy match class. :)
+ let $vocabulary := for $cand at $i in $all
+                    where every $prev in subsequence($all, 1, $i - 1) satisfies not(simc:jaro($prev,$cand) >= $t)
+                    return $cand
+ (: Soft term frequencies: count how many tokens of each string match each vocabulary entry. :)
+ let $tf1 := for $w in $vocabulary return count($left[simc:jaro(.,$w) >= $t])
+ let $tf2 := for $w in $vocabulary return count($right[simc:jaro(.,$w) >= $t])
+ (: Cosine similarity: dot product divided by the product of the Euclidean norms. :)
+ let $dot := sum(for $f at $i in $tf1 return $f * $tf2[$i])
+ return $dot div (math:sqrt(sum(for $f in $tf1 return $f * $f)) * math:sqrt(sum(for $f in $tf2 return $f * $f)))
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings.
+ : The tokens from each string are weighted according to their occurence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ : The Jaro-Winkler similarity function is used to discover token identity, and tokens having a Jaro-Winkler
+ : similarity above a given threshold are considered as matching tokens.
+ :
+ : <br/>
+ : Example usage : <pre> soft-cosine-tokens-jaro-winkler("The FLWOR Foundation", "FLWOR Found.", " +", 1, 4, 0.1 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.45 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @param $t A threshold for the similarity function used to discover token identity.
+ : @param $prefix The number of characters to consider when testing for equal prefixes with the Jaro-Winkler metric.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes with the Jaro-Winkler metric.
+ : @return The cosine similarity coefficient between the sets tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq
+ :)
+declare function simh:soft-cosine-tokens-jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $r as xs:string, $t as xs:double, $prefix as xs:integer?, $fact as xs:double? ) as xs:double {
+ let $left := tokenize($s1,$r)
+ let $right := tokenize($s2,$r)
+ let $all := ($left, $right)
+ (: Vocabulary: one representative token per fuzzy equivalence class under
+  : Jaro-Winkler at threshold $t. :)
+ let $vocabulary := for $cand at $i in $all
+                    where every $prev in subsequence($all, 1, $i - 1) satisfies not(simc:jaro-winkler($prev,$cand,$prefix,$fact) >= $t)
+                    return $cand
+ (: Soft term frequencies of each string against the shared vocabulary. :)
+ let $tf1 := for $w in $vocabulary return count($left[simc:jaro-winkler(.,$w,$prefix,$fact) >= $t])
+ let $tf2 := for $w in $vocabulary return count($right[simc:jaro-winkler(.,$w,$prefix,$fact) >= $t])
+ (: Cosine of the two frequency vectors. :)
+ let $dot := sum(for $f at $i in $tf1 return $f * $tf2[$i])
+ return $dot div (math:sqrt(sum(for $f in $tf1 return $f * $f)) * math:sqrt(sum(for $f in $tf2 return $f * $f)))
+};
+
+(:~
+ : Returns the Monge-Elkan similarity coefficient between two strings, using the Jaro-Winkler
+ : similarity function to discover token identity.
+ :
+ : <br/>
+ : Example usage : <pre> monge-elkan-jaro-winkler("Comput. Sci. and Eng. Dept., University of California, San Diego", "Department of Computer Scinece, Univ. Calif., San Diego", 4, 0.1) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.992 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $prefix The number of characters to consider when testing for equal prefixes with the Jaro-Winkler metric.
+ : @param $fact The weighting factor to consider when the input strings have equal prefixes with the Jaro-Winkler metric.
+ : @return The Monge-Elkan similarity coefficient between the two strings.
+ : @example test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq
+ :)
+declare function simh:monge-elkan-jaro-winkler ( $s1 as xs:string, $s2 as xs:string, $prefix as xs:integer, $fact as xs:double ) as xs:double{
+ let $s1tokens := tokenize($s1, " ")
+ (: Fixed: must tokenize $s2 here — the previous code tokenized $s1 twice, so
+  : the function compared the first string against itself and always scored
+  : perfect matches regardless of $s2. :)
+ let $s2tokens := tokenize($s2, " ")
+ let $length := min((count($s1tokens), count($s2tokens)))
+ (: For every token of $s1, take its best Jaro-Winkler match among the tokens of $s2. :)
+ let $res := for $s1n in $s1tokens
+             return max(for $s2n in $s2tokens return simc:jaro-winkler($s1n,$s2n,$prefix,$fact))
+ (: Monge-Elkan score: average of the per-token best-match similarities. :)
+ return (1 div $length) * sum($res)
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/normalization.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1560 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides data normalization functions for processing calendar dates,
+ : temporal values, currency values, units of measurement, location names and postal addresses.
+ :
+ : These functions are particularly useful for converting different data representations into canonical formats.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins and Diogo Simões
+ : @project data processing/data cleaning
+ :)
+
+module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Converts a given string representation of a date value into a date representation valid according
+ : to the corresponding XML Schema type.
+ :
+ :
+ : @param $sd The string representation for the date
+ : @param $format An optional parameter denoting the format used to represent the date in the string, according to a
+ : sequence of conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed
+ : by a single letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion
+ : specification is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ : <pre>
+ : '%b' Abbreviated month name in the current locale.<br/>
+ : '%B' Full month name in the current locale.<br/>
+ : '%d' Day of the month as decimal number (01-31).<br/>
+ : '%m' Month as decimal number (01-12).<br/>
+ : '%x' Date, locale-specific.<br/>
+ : '%y' Year without century (00-99).<br/>
+ : '%Y' Year with century.<br/>
+ : '%C' Century (00-99): the integer part of the year divided by 100.<br/>
+ : '%D' Locale-specific date format such as '%m/%d/%y'.<br/>
+ : '%e' Day of the month as decimal number (1-31), with a leading space for a single-digit number.<br/>
+ : '%F' Equivalent to %Y-%m-%d (the ISO 8601 date format).<br/>
+ : '%h' Equivalent to '%b'.<br/>
+ :</pre>
+ :
+ : @return The date value resulting from the conversion.
+ : @example test/Queries/data-cleaning/normalization/to-date.xq
+ :)
+declare function normalization:to-date ( $sd as xs:string, $format as xs:string? ) as xs:string?{
+ (: Fixed: return type widened from xs:string to xs:string? — the else() branch
+  : below returns the empty sequence when $sd does not split into at least two
+  : tokens, which was a type error under the old declaration. This also matches
+  : normalization:to-time, which already declares xs:string?. :)
+
+ let $dictionary := normalization:month-dictionary()
+ (: Drop everything before the first '%'; the remaining items are the conversion-specification letters. :)
+ let $format-tokens := tokenize($format, "[ %\-/:]+")[position()>1]
+ let $sd-tokens :=
+  if (contains($sd, "-") or contains($sd, "/") or contains($sd, " "))
+  then tokenize ($sd, "[ \-/]+")
+  (: No separator present: split a compact form such as "12May1999" into
+   : (leading digits, letter run, trailing digits). :)
+  else let $ydtoken := tokenize(replace($sd, "[A-Za-z]", " "), " ")
+    let $ft := $ydtoken[position()=1]
+    let $lt := $ydtoken[last()]
+    let $mtoken := replace($sd, "[0-9]", "") return ($ft, $mtoken, $lt)
+ return
+  if (count($sd-tokens)>1)
+  then
+   (: Year: first matching specifier wins (%F, %D, %Y, then %y with optional %C);
+    : two-digit years without a century default to the 1900s; "YND" = year not determined. :)
+   let $year :=
+    if (count(index-of($format-tokens, "F")) != 0) then string($sd-tokens[position() = 1])
+    else
+
+    if (count(index-of($format-tokens, "D")) != 0) then concat("19", string($sd-tokens[position() = 3]))
+    else
+
+    if (count(index-of($format-tokens, "Y")) != 0)
+    then string($sd-tokens[position() = index-of($format-tokens, "Y")]) else
+
+    if (count(index-of($format-tokens, "y")) != 0)
+    then
+     if(count(index-of($format-tokens, "C")) !=0)
+     then concat(string(number(string($sd-tokens[position() = index-of($format-tokens, "C")]))-1), string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+     else
+      concat("19", string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+    else "YND"
+
+   (: Month: name-based specifiers (%h, %b, %B) are resolved through the month
+    : dictionary; positional/numeric ones (%F, %D, %m) are taken directly; "MND" = month not determined. :)
+   let $month :=
+    if (count(index-of($format-tokens, "h")) != 0)
+    then string($dictionary//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "h")]]/@value) else
+
+    if (count(index-of($format-tokens, "b")) != 0)
+    then string($dictionary//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "b")]]/@value)
+    else
+
+    if (count(index-of($format-tokens, "B")) != 0)
+    then string($dictionary//month[lower-case(@name) =
+        lower-case($sd-tokens[position() = index-of($format-tokens, "B")])]/@value)
+
+    else
+
+    if (count(index-of($format-tokens, "F")) != 0)
+    then string($sd-tokens[position() = 2])
+    else
+
+    if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 1])
+    else
+
+    if (count(index-of($format-tokens, "m")) != 0)
+    then string($sd-tokens[position() = index-of($format-tokens, "m")])
+
+    else "MND"
+
+   (: Day: %F and %D are positional; %d is taken as-is; %e is zero-padded; "DND" = day not determined. :)
+   let $day :=
+    if (count(index-of($format-tokens, "F")) != 0)
+    then string($sd-tokens[position() = 3]) else
+
+    if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 2])
+    else
+
+    if (count(index-of($format-tokens, "d")) != 0)
+    then $sd-tokens[position() = index-of($format-tokens, "d")] else
+
+    if (count(index-of($format-tokens, "e")) != 0)
+    then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+    else "DND"
+
+   (: Assemble YYYY-MM-DD and let check-date validate the candidate. :)
+   let $result := concat($year, "-", $month, "-", $day)
+
+   return normalization:check-date($result)
+  else()
+
+};
+
+(:~
+ : Converts a given string representation of a time value into a time representation valid according to
+ : the corresponding XML Schema type.
+ :
+ :
+ : @param $sd The string representation for the time.
+ : @param $format An optional parameter denoting the format used to represent the time in the string, according to a sequence of
+ : conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed by a single
+ : letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion specification
+ : is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ :
+ : <pre>
+ : '%H' Hours as decimal number (00-23).<br/>
+ : '%I' Hours as decimal number (01-12).<br/>
+ : '%M' Minute as decimal number (00-59).<br/>
+ : '%p' AM/PM indicator in the locale. Used in conjunction with '%I' and *not* with '%H'.<br/>
+ : '%S' Second as decimal number (00-61), allowing for up to two leap-seconds.<br/>
+ : '%X' Time, locale-specific.<br/>
+ : '%z' Offset from Greenwich, so '-0900' is 9 hours west of Greenwich.<br/>
+ : '%Z' Time zone as a character string.<br/>
+ : '%k' The 24-hour clock time with single digits preceded by a blank.<br/>
+ : '%l' The 12-hour clock time with single digits preceded by a blank.<br/>
+ : '%r' The 12-hour clock time (using the locale's AM or PM).<br/>
+ : '%R' Equivalent to '%H:%M'.<br/>
+ : '%T' Equivalent to '%H:%M:%S'.<br/>
+ :</pre>
+ :
+ : @return The time value resulting from the conversion.
+ : @example test/Queries/data-cleaning/normalization/to-time.xq
+ :)
+declare function normalization:to-time ( $sd as xs:string, $format as xs:string? ) as xs:string?{
+ let $timezoneDict := normalization:timeZone-dictionary()
+ let $format-string := replace(replace ($format, '%R', '%H:%M'), '%T', '%H:%M:%S')
+ let $format-tokens := tokenize($format-string, "( |%|:)+")[position()>1]
+ let $sd-tokens :=
+ if (contains($sd, ":") or contains($sd, ".") or contains($sd, " "))
+ then tokenize ($sd, "[ :\.]")
+ else ()
+ return
+ if (count($sd-tokens)>1)
+ then
+ let $hours :=
+ if (count(index-of($format-tokens, "T")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),1,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "H")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "H")]) else
+
+ if (count(index-of($format-tokens, "k")) != 0)
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "k")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "k")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "k")])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then
+ if(lower-case(string($sd-tokens[position() = 4]))="am")
+ then string($sd-tokens[position() = 1])
+ else if(lower-case(string($sd-tokens[position() = 4]))="pm")
+ then if(string($sd-tokens[position() = 1])="12") then 12
+ else string(number(string($sd-tokens[position() = 1]))+12)
+ else()
+
+ else
+
+ if (count(index-of($format-tokens, "I")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then string($sd-tokens[position() = index-of($format-tokens, "I")])
+ else if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "I")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "I")]))+12)
+ else()
+ else()
+
+ else
+ if (count(index-of($format-tokens, "l")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "l")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "l")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "l")])
+
+ else if (lower-case(string($sd-tokens[position() = index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "l")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "l")]))+12)
+ else()
+
+ else ()
+
+
+ else "HND"
+
+ let $minutes :=
+
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),4,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "M")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "M")])
+
+ else "MND"
+
+ let $seconds :=
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),7,2)
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 3])
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then "00"
+ else
+
+ if (count(index-of($format-tokens, "S")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "S")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "00"
+
+ let $result :=
+
+ if (count(index-of($format-tokens, "Z")) != 0)
+ then
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() = index-of($format-tokens, "Z")]]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) > 59)
+ then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) < 0)
+ then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + -(number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2)))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2)))) mod 60))
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+
+
+ if (count(index-of($format-tokens, "z")) != 0)
+ then if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) > 59) then 1
+ else 0
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) < 0) then -1
+ else 0
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (24 + number($complement) + -(number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)))) mod 60))
+
+ return concat($rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+ concat($hours, ":", $minutes, ":", $seconds)
+
+ return
+
+ normalization:check-time($result)
+ else()
+
+};
+
+(:~
+ : Converts a given string representation of a dateTime value into a dateTime representation
+ : valid according to the corresponding XML Schema type.
+ :
+ :
+ : @param $sd The string representation for the dateTime.
+ : @param $format An optional parameter denoting the format used to represent the dateTime in the string, according to a sequence
+ : of conversion specifications. In the format string, a conversion specification is introduced by '%', usually followed by a single
+ : letter or 'O' or 'E' and then a single letter. Any character in the format string that is not part of a conversion specification
+ : is interpreted literally, and the string '%%' gives '%'. The supported conversion specifications are as follows:
+ :
+ : <pre>
+ : '%b' Abbreviated month name in the current locale.<br/>
+ : '%B' Full month name in the current locale.<br/>
+ : '%c' Date and time, locale-specific.<br/>
+ : '%C' Century (00-99): the integer part of the year divided by 100.<br/>
+ : '%d' Day of the month as decimal number (01-31).<br/>
+ : '%H' Hours as decimal number (00-23).<br/>
+ : '%I' Hours as decimal number (01-12).<br/>
+ : '%j' Day of year as decimal number (001-366).<br/>
+ : '%m' Month as decimal number (01-12).<br/>
+ : '%M' Minute as decimal number (00-59).<br/>
+ : '%p' AM/PM indicator in the locale. Used in conjunction with '%I' and *not* with '%H'.<br/>
+ : '%S' Second as decimal number (00-61), allowing for up to two leap-seconds.<br/>
+ : '%x' Date, locale-specific.<br/>
+ : '%X' Time, locale-specific.<br/>
+ : '%y' Year without century (00-99).<br/>
+ : '%Y' Year with century.<br/>
+ : '%z' Offset from Greenwich, so '-0900' is 9 hours west of Greenwich.<br/>
+ : '%Z' Time zone as a character string.<br/>
+ : '%D' Locale-specific date format such as '%m/%d/%y': ISO C99 says it should be that exact format.<br/>
+ : '%e' Day of the month as decimal number (1-31), with a leading space for a single-digit number.<br/>
+ : '%F' Equivalent to %Y-%m-%d (the ISO 8601 date format).<br/>
+ : '%g' The last two digits of the week-based year (see '%V').<br/>
+ : '%G' The week-based year (see '%V') as a decimal number.<br/>
+ : '%h' Equivalent to '%b'.<br/>
+ : '%k' The 24-hour clock time with single digits preceded by a blank.<br/>
+ : '%l' The 12-hour clock time with single digits preceded by a blank.<br/>
+ : '%r' The 12-hour clock time (using the locale's AM or PM).<br/>
+ : '%R' Equivalent to '%H:%M'.<br/>
+ : '%T' Equivalent to '%H:%M:%S'.<br/>
+ :</pre>
+ :
+ : @return The dateTime value resulting from the conversion.
+ : @example test/Queries/data-cleaning/normalization/to-dateTime.xq
+ :)
+declare function normalization:to-dateTime ( $sd as xs:string, $format as xs:string? ) as xs:string {
+ (: Tokenize the input against the format string, rebuild an ISO dateTime, and
+    apply any "%Z"/"%z" timezone offset with manual carry/borrow over minutes,
+    hours, days, months and years. :)
+ let $timezoneDict := normalization:timeZone-dictionary()
+ let $monthDict := normalization:month-dictionary()
+ let $format-string := replace(replace(replace ($format, '%R', '%H:%M'), '%T', '%H:%M:%S'), '%F', '%Y-%m-%d')
+ let $format-tokens := tokenize($format-string, "[ %\-/:\.]+")[position()>1]
+ let $sdt :=
+ if (contains($sd, ":") or contains($sd, ".") or contains($sd, " ") or contains($sd, "-")
+ or contains($sd, "/"))
+ then tokenize ($sd, "[ \-/:\.]+")
+ else ()
+ let $sdtok :=
+ (: "%z" offsets without an explicit '+' lost their '-' during tokenization; restore it. :)
+ if ((count(index-of($format-tokens, "z")) != 0) and (not(contains($sdt[last()], "+"))))
+ then ($sdt[position() != last()], concat("-", $sdt[position() = last()]))
+ else $sdt
+ let $sd-tokens :=
+ for $a in $sdtok
+ return
+ (: Split fused tokens like "01Jan70" into ("01", "Jan", "70"). :)
+ if (matches($a, "[0-9][0-9][A-Za-z]+[0-9][0-9]+"))
+ then (let $ydtoken := tokenize(replace($a, "[A-Za-z]", " "), " ")
+ let $ft := $ydtoken[position()=1]
+ let $lt := $ydtoken[last()]
+ let $mtoken := replace($a, "[0-9]", "") return ($ft, $mtoken, $lt))
+ else $a
+ let $timeFormat := tokenize($format, "[ :\.\-]")[position()>1]
+ let $dateFormat := tokenize($format, "[ :\.\-]")[position()=1]
+ return
+ if (count($sd-tokens)>1)
+ then
+ (:Date:)
+ let $year :=
+ if (count(index-of($format-tokens, "F")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then concat("19", string($sd-tokens[position() = 3]))
+ else
+
+ if (count(index-of($format-tokens, "Y")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "Y")]) else
+
+ if (count(index-of($format-tokens, "y")) != 0)
+ then
+ if(count(index-of($format-tokens, "C")) !=0)
+ then concat(string(number(string($sd-tokens[position() = index-of($format-tokens, "C")]))-1), string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else
+ concat("19", string($sd-tokens[position() = index-of($format-tokens, "y")]))
+
+ else "YND"
+
+ let $month :=
+ if (count(index-of($format-tokens, "h")) != 0)
+ then string($monthDict//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "h")]]/@value) else
+
+ if (count(index-of($format-tokens, "b")) != 0)
+ then string($monthDict//month[abrv/text() = $sd-tokens[position() = index-of($format-tokens, "b")]]/@value)
+ else
+
+ if (count(index-of($format-tokens, "B")) != 0)
+ then string($monthDict//month[lower-case(@name) =
+ lower-case($sd-tokens[position() = index-of($format-tokens, "B")])]/@value)
+
+ else
+
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "m")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "m")])
+
+ else "MND"
+
+ let $day :=
+ if (count(index-of($format-tokens, "F")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "D")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "d")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "d")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "DND"
+
+ (:Time:)
+ let $hours :=
+ if (count(index-of($format-tokens, "T")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),1,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 1])
+ else
+
+ if (count(index-of($format-tokens, "H")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "H")]) else
+
+ if (count(index-of($format-tokens, "k")) != 0)
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "k")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "k")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "k")])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then
+ if(lower-case(string($sd-tokens[position() = 4]))="am")
+ then string($sd-tokens[position() = 1])
+ else if(lower-case(string($sd-tokens[position() = 4]))="pm")
+ then if(string($sd-tokens[position() = 1])="12") then 12
+ else string(number(string($sd-tokens[position() = 1]))+12)
+ else()
+
+ else
+
+ if (count(index-of($format-tokens, "I")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then string($sd-tokens[position() = index-of($format-tokens, "I")])
+ else if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "I")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "I")]))+12)
+ else()
+ else()
+
+ else
+ if (count(index-of($format-tokens, "l")) != 0)
+ then
+ if(count(index-of($format-tokens, "p")) !=0)
+ then if (lower-case(string($sd-tokens[position() =
+ index-of($format-tokens, "p")]))="am")
+ then if(string-length(string($sd-tokens[position() = index-of($format-tokens, "l")]))=1)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "l")]))
+ else string($sd-tokens[position() = index-of($format-tokens, "l")])
+
+ else if (lower-case(string($sd-tokens[position() = index-of($format-tokens, "p")]))="pm")
+ then if (string($sd-tokens[position() = index-of($format-tokens, "l")])="12")
+ then "12"
+ else string(number(string($sd-tokens[position() = index-of($format-tokens, "l")]))+12)
+ else()
+
+ else ()
+
+
+ else "HND"
+
+ let $minutes :=
+
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),4,2)
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 2])
+ else
+
+ if (count(index-of($format-tokens, "M")) != 0)
+ then string($sd-tokens[position() = index-of($format-tokens, "M")])
+
+ else "MND"
+
+ let $seconds :=
+ if (count(index-of($format-tokens, "T")) != 0)
+ then string($sd-tokens[position() = 3]) else
+
+ if (count(index-of($format-tokens, "X")) != 0) then substring(string(current-time()),7,2)
+ else
+
+ if (count(index-of($format-tokens, "r")) != 0)
+ then string($sd-tokens[position() = 3])
+ else
+
+ if (count(index-of($format-tokens, "R")) != 0) then "00"
+ else
+
+ if (count(index-of($format-tokens, "S")) != 0)
+ then $sd-tokens[position() = index-of($format-tokens, "S")] else
+
+ if (count(index-of($format-tokens, "e")) != 0)
+ then concat("0", string($sd-tokens[position() = index-of($format-tokens, "e")]))
+ else "00"
+
+ (: Apply the timezone offset, if any. Offset strings look like "+HHMM"/"-HHMM":
+    substring(...,2,2) = offset hours, substring(...,4,2) = offset minutes. :)
+ let $result :=
+
+ if (count(index-of($format-tokens, "Z")) != 0)
+ then
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() = index-of($format-tokens, "Z")]]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) > 59)
+ then 1
+ else 0
+
+ let $dayscomplement :=
+ if (number($complement) + number($hours) + number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =index-of($format-tokens, "Z")]]),2,2)) >= 24)
+ then 1
+ else 0
+
+ let $monthscomplement :=
+ if(($dayscomplement + number($day) > 28) and (compare($month, '02') = 0) and (number($year) mod 4 != 0))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 30) and ((compare($month, '04') = 0) or (compare($month, '06') = 0) or (compare($month, '09') = 0) or (compare($month, '11') = 0)))
+ then 1
+ else
+ (: fixed: '04' removed from the 31-day month list (April has 30 days). :)
+ if(($dayscomplement + number($day) > 31) and ((compare($month, '01') = 0) or (compare($month, '03') = 0) or (compare($month, '05') = 0) or (compare($month, '07') = 0) or (compare($month, '08') = 0) or (compare($month, '10') = 0) or (compare($month, '12') = 0)))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 29) and (compare($month, '02') = 0) and (number($year) mod 4 = 0))
+ then 1
+ else 0
+
+ let $ryear :=
+ if ($monthscomplement + number($month) > 12)
+ then string(number($year) + 1)
+ else $year
+
+ let $daywcompl :=
+ if ($monthscomplement = 1)
+ then 1
+ else number($day) + $dayscomplement
+
+ let $monthwcompl :=
+ if($monthscomplement + number($month) <= 12)
+ then number($month) + $monthscomplement
+ else 1
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),1,1)='-')
+ then
+ (: fixed: minutes borrow must compare against the offset MINUTES (offset 4), not hours (offset 2). :)
+ let $complement :=
+ if (number($minutes)-number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)) < 0)
+ then -1
+ else 0
+
+ (: fixed: the borrow test is on the adjusted hour (complement + hours - offset), not (complement - hours - offset). :)
+ let $dayscomplement :=
+ if (number($complement) + number($hours) - number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position()=
+ index-of($format-tokens, "Z")]]),2,2)) < 0)
+ then -1
+ else 0
+
+ let $monthcomplement :=
+ if(number($day) + $dayscomplement < 1)
+ then -1
+ else 0
+
+ let $yearcomplement :=
+ if(number($month) + $monthcomplement< 1)
+ then -1
+ else 0
+
+ let $daywcompl :=
+ if ($monthcomplement = 0)
+ then number($day) + $dayscomplement
+ else
+ if ( (number($month) = 5) or (number($month) = 7) or (number($month) = 10) or (number($month) = 12))
+ then 30
+ else
+ if((number($month) = 4) or (number($month) = 6) or (number($month) = 9) or (number($month) = 11) or (number($month) = 2) or (number($month) = 1) or (number($month) = 8))
+ then 31
+ else
+ if((number($month) = 3) and (number($year) mod 4 != 0))
+ then 28
+ else
+ if((number($month) = 3) and (number($year) mod 4 = 0))
+ then 29
+ else number($day) + $dayscomplement
+
+ let $monthwcompl:=
+ if($yearcomplement = 0)
+ then number($month) + $monthcomplement
+ else 12
+
+ let $ryear :=
+ number($year) + $yearcomplement
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (: fixed: same expression as the unpadded branch; $hours was wrongly negated here. :)
+ (24 + number($complement) + number($hours) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (: fixed: read the offset MINUTES (offset 4), matching the unpadded branch. :)
+ (60 - -(number($minutes) -
+ number(substring(string($timezoneDict//timeZone/@value[../@name=$sd-tokens[position() =
+ index-of($format-tokens, "Z")]]),4,2)))) mod 60))
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+
+
+ if (count(index-of($format-tokens, "z")) != 0)
+ then if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='+')
+ then let $complement :=
+ if (number($minutes)+number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) > 59) then 1
+ else 0
+
+ let $dayscomplement :=
+ if (number($complement) + number($hours) + number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)) >= 24)
+ then 1
+ else 0
+
+ let $monthscomplement :=
+ if(($dayscomplement + number($day) > 28) and (compare($month, '02') = 0) and (number($year) mod 4 != 0))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 30) and ((compare($month, '04') = 0) or (compare($month, '06') = 0) or (compare($month, '09') = 0) or (compare($month, '11') = 0)))
+ then 1
+ else
+ (: fixed: '04' removed from the 31-day month list (April has 30 days). :)
+ if(($dayscomplement + number($day) > 31) and ((compare($month, '01') = 0) or (compare($month, '03') = 0) or (compare($month, '05') = 0) or (compare($month, '07') = 0) or (compare($month, '08') = 0) or (compare($month, '10') = 0) or (compare($month, '12') = 0)))
+ then 1
+ else
+ if(($dayscomplement + number($day) > 29) and (compare($month, '02') = 0) and (number($year) mod 4 = 0))
+ then 1
+ else 0
+
+ let $ryear :=
+ if ($monthscomplement + number($month) > 12)
+ then string(number($year) + 1)
+ else $year
+
+ let $daywcompl :=
+ if ($monthscomplement = 1)
+ then 1
+ else number($day) + $dayscomplement
+
+ let $monthwcompl :=
+ if($monthscomplement + number($month) <= 12)
+ then number($month) + $monthscomplement
+ else 1
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if (string-length(string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) +
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if (string-length(string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes)+
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else
+
+ if (substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),1,1)='-')
+ then
+ let $complement :=
+ if (number($minutes)-number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)) < 0) then -1
+ else 0
+
+ (: fixed: the borrow test is on the adjusted hour (complement + hours - offset), not (complement - hours - offset). :)
+ let $dayscomplement :=
+ if (number($complement) + number($hours) - number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2)) < 0)
+ then -1
+ else 0
+
+ let $monthcomplement :=
+ if(number($day) + $dayscomplement< 1)
+ then -1
+ else 0
+
+ let $yearcomplement :=
+ if(number($month) + $monthcomplement< 1)
+ then -1
+ else 0
+
+ let $daywcompl :=
+ if ($monthcomplement = 0)
+ then number($day) + $dayscomplement
+ else
+ if ( (number($month) = 5) or (number($month) = 7) or (number($month) = 10) or (number($month) = 12))
+ then 30
+ else
+ if((number($month) = 4) or (number($month) = 6) or (number($month) = 9) or (number($month) = 11) or (number($month) = 2) or (number($month) = 1) or (number($month) = 8))
+ then 31
+ else
+ if((number($month) = 3) and (number($year) mod 4 != 0))
+ then 28
+ else
+ if((number($month) = 3) and (number($year) mod 4 = 0))
+ then 29
+ else number($day) + $dayscomplement
+
+ let $monthwcompl:=
+ if($yearcomplement = 0)
+ then number($month) + $monthcomplement
+ else 12
+
+ let $ryear :=
+ number($year) + $yearcomplement
+
+ let $rday :=
+ if (string-length(string($daywcompl)) = 1)
+ then concat ('0', string($daywcompl))
+ else string($daywcompl)
+
+ let $rmonth :=
+ if (string-length(string($monthwcompl)) = 1)
+ then concat ('0', string($monthwcompl))
+ else string($monthwcompl)
+
+ let $rhours :=
+ if( ((number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24) >= 0 )
+ then
+ if (string-length(string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else
+ if (string-length(string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24)) = 2)
+ then (string(
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+ else concat("0",
+ string(
+ (: fixed: same expression as the unpadded branch; $hours was wrongly negated here. :)
+ (24 + number($complement) + number($hours) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),2,2))) mod 24))
+
+ let $rminutes :=
+ if( ((number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60) >= 0 )
+ then
+ if (string-length(string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60)) = 2)
+ then (string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else concat("0",
+ string(
+ (number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2))) mod 60))
+ else
+ if (string-length(string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60)) = 2)
+ then (string(
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+ else concat("0",
+ string(
+ (: fixed: read the offset MINUTES (offset 4), matching the unpadded branch. :)
+ (60 - -(number($minutes) -
+ number(substring(string($sd-tokens[position() = index-of($format-tokens, "z")]),4,2)))) mod 60))
+
+ return concat($ryear, "-", $rmonth, "-", $rday, "T", $rhours, ":", $rminutes, ":", $seconds)
+ else ()
+ else
+ concat($year, "-", $month, "-", $day, "T", $hours, ":", $minutes, ":", $seconds)
+
+ return
+ normalization:check-dateTime($result)
+ else()
+};
+
+(:~
+ : Uses an address normalization Web service to convert a postal address given as input into a
+ : canonical representation format.
+ :
+ :
+ : @param $addr A sequence of strings encoding an address, where each string in the sequence corresponds to a different component (e.g., street, city, country, etc.) of the address.
+ : @return A sequence of strings with the address encoded in a canonical format, where each string in the sequence corresponds to a different component (e.g., street, city, country, etc.) of the address.
+ : @example test/Queries/data-cleaning/normalization/normalize-address.xq
+ :)
+declare %ann:nondeterministic function normalization:normalize-address ( $addr as xs:string* ) as xs:string* {
+
+ (: NOTE(review): $id is an empty Yahoo appid; the geocode service may reject
+    or throttle keyless calls — confirm whether a real appid should be set. :)
+ let $id := ""
+ let $url := "http://where.yahooapis.com/geocode?q="
+ (: Build the query string: spaces inside each address component become '+',
+    components are joined with commas. :)
+ let $q2 := string-join(for $i in $addr return translate($i," ","+"),",")
+ let $call := concat($url,$q2,"&appid=",$id)
+ (: http:get-node returns (metadata, payload); [2] selects the response document. :)
+ let $doc := http:get-node($call)[2]
+ (: Keep only the non-empty address components, from most general (country)
+    to most specific (house), and drop duplicates. :)
+ return distinct-values( (if (string-length($doc//xs:string(*:country)) > 0) then ($doc//xs:string(*:country)) else (),
+ if (string-length($doc//xs:string(*:state)) > 0) then ($doc//xs:string(*:state)) else (),
+ if (string-length($doc//xs:string(*:county)) > 0) then ($doc//xs:string(*:county)) else (),
+ if (string-length($doc//xs:string(*:city)) > 0) then ($doc//xs:string(*:city)) else (),
+ if (string-length($doc//xs:string(*:neighborhood)) > 0) then ($doc//xs:string(*:neighborhood)) else (),
+ if (string-length($doc//xs:string(*:street)) > 0) then ($doc//xs:string(*:street)) else (),
+ if (string-length($doc//xs:string(*:house)) > 0) then ($doc//xs:string(*:house)) else () ) )
+};
+
+(:~
+ : Uses a phone number normalization Web service to convert a phone number given as input into a
+ : canonical representation.
+ :
+ : @param $phone A string encoding a phone number.
+ : @return A string with the phone number encoded in a canonical format.
+ :
+ : <br/><br/><b> Attention : This function is still not implemented. </b> <br/>
+ :
+ :)
+(: Placeholder implementation: phone-number normalization is not implemented
+   yet, so the empty sequence is always returned.  The parameter is renamed
+   from $addr to $phone to match the documented @param name; XQuery function
+   calls are positional, so existing callers are unaffected. :)
+declare function normalization:normalize-phone ( $phone as xs:string* ) as xs:string* {
+  ()
+};
+
+(:~
+ : Internal auxiliary function that returns an XML representation for a dictionary that contains the
+ : time-shift value associated to different time-zone abbreviations.
+ :)
+(: Every value is a UTC offset in the fixed "+HHMM"/"-HHMM" lexical form;
+   callers parse it positionally (sign at 1, hours at 2-3, minutes at 4-5).
+   BUGFIX: "LHST" was "+10:30" (breaking the fixed-width format) and the
+   nautical letter "E" had an empty value (the letter series D=+0400,
+   F=+0600 implies E=+0500).  Abbreviations with several possible offsets
+   (AMST, CST, IST, WST, ...) keep one active entry; the alternatives are
+   preserved as XML comments. :)
+declare %private function normalization:timeZone-dictionary() as element(){
+ let $result :=
+ <dictionary>
+ <timeZone name="A" value="+0100"/>
+ <timeZone name="ADT" value="-0300"/>
+ <timeZone name="AFT" value="+0430"/>
+ <timeZone name="AKDT" value="-0800"/>
+ <timeZone name="AKST" value="-0900"/>
+ <timeZone name="ALMT" value="+0600"/>
+ <timeZone name="AMST" value="+0500"/>
+ <!--<timeZone name="AMST" value="-0300"/>-->
+ <timeZone name="AMT" value="+0400"/>
+ <!--<timeZone name="AMT" value="-0400"/>-->
+ <timeZone name="ANAST" value="+1200"/>
+ <timeZone name="ANAT" value="+1200"/>
+ <timeZone name="AQTT" value="+0500"/>
+ <timeZone name="ART" value="-0300"/>
+ <timeZone name="AST" value="-0400"/>
+ <timeZone name="AZOST" value="+0000"/>
+ <timeZone name="AZOT" value="-0100"/>
+ <timeZone name="AZST" value="+0500"/>
+ <timeZone name="AZT" value="+0400"/>
+ <timeZone name="B" value="+0200"/>
+ <timeZone name="BNT" value="+0800"/>
+ <timeZone name="BOT" value="-0400"/>
+ <timeZone name="BRST" value="-0200"/>
+ <timeZone name="BRT" value="-0300"/>
+ <!--<timeZone name="BST" value="+0600"/>-->
+ <timeZone name="BST" value="+0100"/>
+ <timeZone name="BTT" value="+0600"/>
+ <timeZone name="C" value="+0300"/>
+ <timeZone name="CAST" value="+0800"/>
+ <timeZone name="CAT" value="+0200"/>
+ <timeZone name="CCT" value="+0630"/>
+ <!--<timeZone name="CDT" value="+1030"/>-->
+ <!--<timeZone name="CDT" value="-0400"/>-->
+ <timeZone name="CDT" value="-0500"/>
+ <timeZone name="CEST" value="+0200"/>
+ <timeZone name="CET" value="+0100"/>
+ <timeZone name="CHADT" value="+1345"/>
+ <timeZone name="CHAST" value="+1245"/>
+ <timeZone name="CKT" value="-1000"/>
+ <timeZone name="CLST" value="-0300"/>
+ <timeZone name="CLT" value="-0400"/>
+ <timeZone name="COT" value="-0500"/>
+ <!--<timeZone name="CST" value="+0800"/>-->
+ <!--<timeZone name="CST" value="+0930"/>-->
+ <!--<timeZone name="CST" value="-0600"/>-->
+ <!--<timeZone name="CST" value="-0500"/>-->
+ <timeZone name="CST" value="-0600"/>
+ <timeZone name="CVT" value="-0100"/>
+ <timeZone name="CXT" value="+0700"/>
+ <timeZone name="ChST" value="+1000"/>
+ <timeZone name="D" value="+0400"/>
+ <timeZone name="DAVT" value="+0700"/>
+ <timeZone name="E" value="+0500"/>
+ <timeZone name="EASST" value="-0500"/>
+ <timeZone name="EAST" value="-0600"/>
+ <timeZone name="EAT" value="+0300"/>
+ <timeZone name="ECT" value="-0500"/>
+ <!--<timeZone name="EDT" value="+1100"/>-->
+ <timeZone name="EDT" value="-0400"/>
+ <timeZone name="EEST" value="+0300"/>
+ <timeZone name="EET" value="+0200"/>
+ <timeZone name="EGST" value="+0000"/>
+ <timeZone name="EGT" value="-0100"/>
+ <timeZone name="EST" value="+1000"/>
+ <!--<timeZone name="EST" value="-0500"/>-->
+ <timeZone name="ET" value="-0500"/>
+ <timeZone name="F" value="+0600"/>
+ <timeZone name="FJST" value="+1300"/>
+ <timeZone name="FJT" value="+1200"/>
+ <timeZone name="FKST" value="-0300"/>
+ <timeZone name="FKT" value="-0400"/>
+ <timeZone name="FNT" value="-0200"/>
+ <timeZone name="G" value="+0700"/>
+ <timeZone name="GALT" value="-0600"/>
+ <timeZone name="GAMT" value="-0900"/>
+ <timeZone name="GET" value="+0400"/>
+ <timeZone name="GFT" value="-0300"/>
+ <timeZone name="GILT" value="+1200"/>
+ <timeZone name="GMT" value="+0000"/>
+ <timeZone name="GST" value="+0400"/>
+ <timeZone name="GYT" value="-0400"/>
+ <timeZone name="H" value="+0800"/>
+ <timeZone name="HAA" value="-0300"/>
+ <timeZone name="HAC" value="-0500"/>
+ <timeZone name="HADT" value="-0900"/>
+ <timeZone name="HAE" value="-0400"/>
+ <timeZone name="HAP" value="-0700"/>
+ <timeZone name="HAR" value="-0600"/>
+ <timeZone name="HAST" value="-1000"/>
+ <timeZone name="HAT" value="-0230"/>
+ <timeZone name="HAY" value="-0800"/>
+ <timeZone name="HKT" value="+0800"/>
+ <timeZone name="HLV" value="-0430"/>
+ <timeZone name="HNA" value="-0400"/>
+ <timeZone name="HNC" value="-0600"/>
+ <timeZone name="HNE" value="-0500"/>
+ <timeZone name="HNP" value="-0800"/>
+ <timeZone name="HNR" value="-0700"/>
+ <timeZone name="HNT" value="-0330"/>
+ <timeZone name="I" value="+0900"/>
+ <timeZone name="ICT" value="+0700"/>
+ <timeZone name="IDT" value="+0300"/>
+ <timeZone name="IOT" value="+0600"/>
+ <timeZone name="IRDT" value="+0430"/>
+ <timeZone name="IRKST" value="+0900"/>
+ <timeZone name="IRKT" value="+0800"/>
+ <timeZone name="IRST" value="+0330"/>
+ <!--<timeZone name="IST" value="+0200"/>-->
+ <timeZone name="IST" value="+0530"/>
+ <!--<timeZone name="IST" value="+0100"/>-->
+ <timeZone name="JST" value="+0900"/>
+ <timeZone name="K" value="+1000"/>
+ <timeZone name="KGT" value="+0600"/>
+ <timeZone name="KRAST" value="+0800"/>
+ <timeZone name="KRAT" value="+0700"/>
+ <timeZone name="KST" value="+0900"/>
+ <timeZone name="KUYT" value="+0400"/>
+ <timeZone name="L" value="+1100"/>
+ <timeZone name="LHDT" value="+1100"/>
+ <timeZone name="LHST" value="+1030"/>
+ <timeZone name="LINT" value="+1400"/>
+ <timeZone name="M" value="+1200"/>
+ <timeZone name="MAGST" value="+1200"/>
+ <timeZone name="MAGT" value="+1100"/>
+ <timeZone name="MART" value="-0930"/>
+ <timeZone name="MAWT" value="+0500"/>
+ <timeZone name="MDT" value="-0600"/>
+ <timeZone name="MHT" value="+1200"/>
+ <timeZone name="MMT" value="+0630"/>
+ <timeZone name="MSD" value="+0400"/>
+ <timeZone name="MSK" value="+0300"/>
+ <timeZone name="MST" value="-0700"/>
+ <timeZone name="MUT" value="+0400"/>
+ <timeZone name="MVT" value="+0500"/>
+ <timeZone name="MYT" value="+0800"/>
+ <timeZone name="N" value="-0100"/>
+ <timeZone name="NCT" value="+1100"/>
+ <timeZone name="NDT" value="-0230"/>
+ <timeZone name="NFT" value="+1130"/>
+ <timeZone name="NOVST" value="+0700"/>
+ <timeZone name="NOVT" value="+0600"/>
+ <timeZone name="NPT" value="+0545"/>
+ <timeZone name="NST" value="-0330"/>
+ <timeZone name="NUT" value="-1100"/>
+ <timeZone name="NZDT" value="+1300"/>
+ <timeZone name="NZST" value="+1200"/>
+ <timeZone name="O" value="-0200"/>
+ <timeZone name="OMSST" value="+0700"/>
+ <timeZone name="OMST" value="+0600"/>
+ <timeZone name="P" value="-0300"/>
+ <timeZone name="PDT" value="-0700"/>
+ <timeZone name="PET" value="-0500"/>
+ <timeZone name="PETST" value="+1200"/>
+ <timeZone name="PETT" value="+1200"/>
+ <timeZone name="PGT" value="+1000"/>
+ <timeZone name="PHOT" value="+1300"/>
+ <timeZone name="PHT" value="+0800"/>
+ <timeZone name="PKT" value="+0500"/>
+ <timeZone name="PMDT" value="-0200"/>
+ <timeZone name="PMST" value="-0300"/>
+ <timeZone name="PONT" value="+1100"/>
+ <timeZone name="PST" value="-0800"/>
+ <timeZone name="PT" value="-0800"/>
+ <timeZone name="PWT" value="+0900"/>
+ <timeZone name="PYST" value="-0300"/>
+ <timeZone name="PYT" value="-0400"/>
+ <timeZone name="Q" value="-0400"/>
+ <timeZone name="R" value="-0500"/>
+ <timeZone name="RET" value="+0400"/>
+ <timeZone name="S" value="-0600"/>
+ <timeZone name="SAMT" value="+0400"/>
+ <timeZone name="SAST" value="+0200"/>
+ <timeZone name="SBT" value="+1100"/>
+ <timeZone name="SCT" value="+0400"/>
+ <timeZone name="SGT" value="+0800"/>
+ <timeZone name="SRT" value="-0300"/>
+ <timeZone name="SST" value="-1100"/>
+ <timeZone name="T" value="-0700"/>
+ <timeZone name="TAHT" value="-1000"/>
+ <timeZone name="TFT" value="+0500"/>
+ <timeZone name="TJT" value="+0500"/>
+ <timeZone name="TKT" value="-1000"/>
+ <timeZone name="TLT" value="+0900"/>
+ <timeZone name="TMT" value="+0500"/>
+ <timeZone name="TVT" value="+1200"/>
+ <timeZone name="U" value="-0800"/>
+ <timeZone name="ULAT" value="+0800"/>
+ <timeZone name="UTC" value="+0000"/>
+ <timeZone name="UYST" value="-0200"/>
+ <timeZone name="UYT" value="-0300"/>
+ <timeZone name="UZT" value="+0500"/>
+ <timeZone name="V" value="-0900"/>
+ <timeZone name="VET" value="-0430"/>
+ <timeZone name="VLAST" value="+1100"/>
+ <timeZone name="VLAT" value="+1000"/>
+ <timeZone name="VUT" value="+1100"/>
+ <timeZone name="W" value="-1000"/>
+ <timeZone name="WAST" value="+0200"/>
+ <timeZone name="WAT" value="+0100"/>
+ <timeZone name="WDT" value="+0900"/>
+ <timeZone name="WEST" value="+0100"/>
+ <timeZone name="WET" value="+0000"/>
+ <timeZone name="WFT" value="+1200"/>
+ <timeZone name="WGST" value="-0200"/>
+ <timeZone name="WGT" value="-0300"/>
+ <timeZone name="WIB" value="+0700"/>
+ <timeZone name="WIT" value="+0900"/>
+ <timeZone name="WITA" value="+0800"/>
+ <!--<timeZone name="WST" value="+0100"/>-->
+ <!--<timeZone name="WST" value="-1100"/>-->
+ <timeZone name="WST" value="+0800"/>
+ <timeZone name="WT" value="+0000"/>
+ <timeZone name="X" value="-1100"/>
+ <timeZone name="Y" value="-1200"/>
+ <timeZone name="YAKST" value="+1000"/>
+ <timeZone name="YAKT" value="+0900"/>
+ <timeZone name="YAPT" value="+1000"/>
+ <timeZone name="YEKST" value="+0600"/>
+ <timeZone name="YEKY" value="+0500"/>
+ <timeZone name="Z" value="+0000"/>
+ </dictionary>
+return $result
+};
+
+(:~
+ : Internal auxiliary function that returns an XML representation for a dictionary that contains a
+ : numeric value associated to different month name abbreviations.
+ :)
+(: The dictionary element is returned directly from the literal constructor;
+   no intermediate variable binding is needed. :)
+declare %private function normalization:month-dictionary() as element(){
+  <dictionary>
+    <month name="January" value="01">
+      <abrv>Jan</abrv>
+      <abrv>jan</abrv>
+      <abrv>JAN</abrv>
+    </month>
+    <month name="February" value="02">
+      <abrv>Feb</abrv>
+      <abrv>feb</abrv>
+      <abrv>FEB</abrv>
+    </month>
+    <month name="March" value="03">
+      <abrv>Mar</abrv>
+      <abrv>mar</abrv>
+      <abrv>MAR</abrv>
+    </month>
+    <month name="April" value="04">
+      <abrv>Apr</abrv>
+      <abrv>apr</abrv>
+      <abrv>APR</abrv>
+    </month>
+    <month name="May" value="05">
+      <abrv>MAY</abrv>
+      <abrv>may</abrv>
+    </month>
+    <month name="June" value="06">
+      <abrv>Jun</abrv>
+      <abrv>jun</abrv>
+      <abrv>JUN</abrv>
+    </month>
+    <month name="July" value="07">
+      <abrv>Jul</abrv>
+      <abrv>jul</abrv>
+      <abrv>JUL</abrv>
+    </month>
+    <month name="August" value="08">
+      <abrv>aug</abrv>
+      <abrv>Aug</abrv>
+      <abrv>AUG</abrv>
+    </month>
+    <month name="September" value="09">
+      <abrv>sep</abrv>
+      <abrv>Sep</abrv>
+      <abrv>SEP</abrv>
+    </month>
+    <month name="October" value="10">
+      <abrv>oct</abrv>
+      <abrv>OCT</abrv>
+      <abrv>Oct</abrv>
+    </month>
+    <month name="November" value="11">
+      <abrv>nov</abrv>
+      <abrv>Nov</abrv>
+      <abrv>NOV</abrv>
+    </month>
+    <month name="December" value="12">
+      <abrv>dec</abrv>
+      <abrv>Dec</abrv>
+      <abrv>DEC</abrv>
+    </month>
+  </dictionary>
+};
+
+(:~
+ : Internal auxiliary function that checks if a string is in xs:dateTime format
+ :
+ :
+ : @param $dateTime The string representation for the dateTime.
+ : @return The dateTime string if it represents the xs:dateTime format.
+ :)
+declare %private function normalization:check-dateTime($dateTime as xs:string) as xs:string{
+  (: Casting to xs:dateTime raises a dynamic error when the lexical form is
+     invalid; on success the extracted year is glued back onto the remainder
+     of the original string (everything from position 5 onwards). :)
+  let $parsed := xs:dateTime($dateTime)
+  let $year := string(year-from-dateTime($parsed))
+  return concat($year, substring($dateTime,5))
+};
+
+(:~
+ : Internal auxiliary function that checks if a string is in xs:date format
+ :
+ :
+ : @param $date The string representation for the date.
+ : @return The date string if it represents the xs:date format.
+ :)
+declare %private function normalization:check-date($date as xs:string) as xs:string{
+  (: Casting to xs:date raises a dynamic error when the lexical form is
+     invalid; on success the extracted year is glued back onto the remainder
+     of the original string (everything from position 5 onwards). :)
+  let $parsed := xs:date($date)
+  let $year := string(year-from-date($parsed))
+  return concat($year, substring($date,5))
+};
+
+(:~
+ : Internal auxiliary function that checks if a string is in xs:time format
+ :
+ :
+ : @param $Time The string representation for the time.
+ : @return The time string if it represents the xs:time format.
+ :)
+declare %private function normalization:check-time($Time as xs:string) as xs:string{
+  (: Validation happens through the xs:time cast, which raises a dynamic
+     error for malformed input.  BUGFIX: the previous condition
+     string(hours-from-time(...)) was always true for a valid time (even
+     "0" is a non-empty string), and its dead else branch returned an
+     empty sequence in violation of the declared xs:string return type.
+     The emptiness test below keeps the validating cast while always
+     returning the input for a valid time. :)
+  if (empty(hours-from-time(xs:time($Time))))
+  then ""
+  else $Time
+};
+
+
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/phonetic-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,117 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides phonetic string similarity functions, comparing strings with basis on how they sound.
+ :
+ : These metrics are particularly effective in matching names, since names are often spelled in different
+ : ways that sound the same.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the Soundex key for a given string.
+ :
+ : <br/>
+ : Example usage : <pre> soundex-key("Robert") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> "R163" </pre>
+ :
+ : @param $s1 The string.
+ : @return The Soundex key for the given input string.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq
+ :)
+declare function simp:soundex-key ( $s1 as xs:string ) as xs:string {
+  (: Map every letter after the first one to its Soundex digit class. :)
+  let $tail := upper-case(substring($s1,2))
+  let $c1 := replace($tail,"[BFPV]","1")
+  let $c2 := replace($c1,"[CGJKQSXZ]","2")
+  let $c3 := replace($c2,"[DT]","3")
+  let $c4 := replace($c3,"L","4")
+  let $c5 := replace($c4,"[MN]","5")
+  let $c6 := replace($c5,"R","6")
+  (: Drop vowels and every other character without a digit class. :)
+  let $digits := replace($c6,"[^1-6]","")
+  (: Collapse adjacent pairs of identical digits into a single digit. :)
+  let $collapsed := replace($digits,"([1-6])\1","$1")
+  let $key := concat(upper-case(substring($s1,1,1)), $collapsed)
+  return
+    (: Recurse while repeated digits remain, then pad/truncate to 4 chars. :)
+    if (string-length($key) > 4 and matches($key,"([1-6])\1"))
+    then simp:soundex-key($key)
+    else substring(concat($key,"0000"),1,4)
+};
+
+(:~
+ : Checks if two strings have the same Soundex key.
+ :
+ : <br/>
+ : Example usage : <pre> soundex( "Robert" , "Rupert" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> true </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return Returns true if both strings have the same Soundex key and false otherwise.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq
+ :)
+declare function simp:soundex ( $s1 as xs:string, $s2 as xs:string ) as xs:boolean {
+  (: Two strings "sound alike" when their Soundex keys coincide. :)
+  let $key1 := simp:soundex-key($s1)
+  let $key2 := simp:soundex-key($s2)
+  return $key1 = $key2
+};
+
+(:~
+ : Returns the Metaphone key for a given string.
+ : The Metaphone algorithm produces variable length keys as its output, as opposed to Soundex's fixed-length keys.
+ :
+ : <br/>
+ : Example usage : <pre> metaphone-key("ALEKSANDER") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> "ALKSNTR" </pre>
+ :
+ : @param $s1 The string.
+ : @return The Metaphone key for the given input string.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq
+ :)
+declare function simp:metaphone-key ( $s1 as xs:string ) as xs:string {
+  (: Collapse doubled letters, except doubled "C". :)
+  let $aux1 := replace(upper-case($s1),"([^C])\1","$1")
+  (: BUGFIX: the initial-letter rule used "$(...)" — the end-of-string
+     anchor — and therefore could never match; a *leading* KN/GN/PN/AE/WR
+     must be anchored with "^" so that its first letter is dropped. :)
+  let $aux2 := if (matches($aux1,"^(([KGP]N)|([A]E)|([W]R))")) then (substring($aux1,2,string-length($aux1))) else ($aux1)
+  let $aux3 := replace(replace($aux2,"MB","M"),"B$","")
+  let $aux4 := replace(replace(replace(replace(replace($aux3,"CIA","XIA"),"SCH","SKH"),"CH","XH"),"C([IEY])","S$1"),"C","K")
+  let $aux5 := replace(replace($aux4,"DG([EYI])","JG$1"),"D","T")
+  let $aux6 := replace(replace($aux5,"GH([^AEIOU])","H$1"),"G(N(ED)?)$","$1")
+  let $aux7 := replace(replace(replace($aux6,"([^G]?)G([IEY])","$1J$2"),"([^G]?)G","$1K"),"GG","G")
+  let $aux8 := replace(replace(replace(replace($aux7,"([AEIOU])H([^AEIOU])","$1$2"),"CK","K"),"PH","F"),"Q","K")
+  let $aux9 := replace(replace(replace(replace(replace($aux8,"S(H|(IO)|(IA))","X$1"),"T((IO)|(IA))","X$1"),"TH","0"),"TCH","CH"),"V","F")
+  (: BUGFIX: "$WH" and "$X" likewise used the wrong anchor and were dead
+     patterns; a leading WH becomes W and a leading X becomes S. :)
+  let $aux10 := replace(replace(replace(replace(replace(replace($aux9,"^WH","W"),"W([^AEIOU])","$1"),"^X","S"),"X","KS"),"Y([^AEIOU])","$1"),"Z","S")
+  (: Keep the first character verbatim and strip the remaining vowels. :)
+  return concat(substring($aux10,1,1) , replace(substring($aux10,2,string-length($aux10)) , "[AEIOU]", ""))
+};
+
+(:~
+ : Checks if two strings have the same Metaphone key.
+ :
+ : <br/>
+ : Example usage : <pre> metaphone("ALEKSANDER", "ALEXANDRE") </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> true </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @return Returns true if both strings have the same Metaphone key and false otherwise.
+ : @example test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq
+ :)
+declare function simp:metaphone ( $s1 as xs:string, $s2 as xs:string ) as xs:boolean {
+  (: Two strings match phonetically when their Metaphone keys coincide. :)
+  let $key1 := simp:metaphone-key($s1)
+  let $key2 := simp:metaphone-key($s2)
+  return $key1 = $key2
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/set-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,150 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides similarity functions for comparing sets of XML
+ : nodes (e.g., sets of XML elements, attributes or atomic values).
+ :
+ : These functions are particularly useful for matching near duplicate sets of XML nodes.
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the union between two sets, using the deep-equal() function to compare the XML nodes from the sets.
+ :
+ : <br/>
+ : Example usage : <pre> deep-union ( ( "a", "b", "c") , ( "a", "a", <d/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a", "b", "c", <d/> ) </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The union of both sets.
+ : @example test/Queries/data-cleaning/set-similarity/deep-union.xq
+ :)
+declare function set:deep-union ( $s1 , $s2 ) {
+  (: Concatenate both sets and keep each node only at its first
+     (deep-equal) occurrence. :)
+  let $all := ( $s1 , $s2 )
+  for $node at $pos in $all
+  where not(some $seen in subsequence($all, 1, $pos - 1) satisfies deep-equal($seen,$node))
+  return $node
+};
+
+(:~
+ : Returns the intersection between two sets, using the deep-equal() function to compare the XML nodes from the sets.
+ :
+ : <br/>
+ : Example usage : <pre> deep-intersect ( ( "a", "b", "c") , ( "a", "a", <d/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a") </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The intersection of both sets.
+ : @example test/Queries/data-cleaning/set-similarity/deep-intersect.xq
+ :)
+declare function set:deep-intersect ( $s1 , $s2 ) {
+  for $node at $pos in $s1
+  (: keep only the first deep-equal occurrence within $s1 ... :)
+  let $unseen := not(some $seen in subsequence($s1, 1, $pos - 1) satisfies deep-equal($seen,$node))
+  (: ... and only when the node also appears somewhere in $s2 :)
+  let $shared := some $other in $s2 satisfies deep-equal($other,$node)
+  where $unseen and $shared
+  return $node
+};
+
+(:~
+ : Removes exact duplicates from a set, using the deep-equal() function to compare the XML nodes from the sets.
+ :
+ : <br/>
+ : Example usage : <pre> distinct ( ( "a", "a", <b/> ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("a", <b/> ) </pre>
+ :
+ : @param $s A set.
+ : @return The set provided as input without the exact duplicates (i.e., returns the distinct nodes from the set provided as input).
+ : @example test/Queries/data-cleaning/set-similarity/distinct.xq
+ :)
+declare function set:distinct ( $s ) {
+  (: Keep each node only at its first (deep-equal) occurrence. :)
+  for $node at $pos in $s
+  where not(some $seen in subsequence($s, 1, $pos - 1) satisfies deep-equal($seen,$node))
+  return $node
+};
+
+(:~
+ : Returns the overlap coefficient between two sets of XML nodes.
+ : The overlap coefficient is defined as the shared information between the input sets
+ : (i.e., the size of the intersection) over the size of the smallest input set.
+ :
+ : <br/>
+ : Example usage : <pre> overlap ( ( "a", "b", <c/> ) , ( "a", "a", "b" ) ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The overlap coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/overlap.xq
+ :)
+declare function set:overlap ( $s1 , $s2 ) as xs:double {
+  (: Shared distinct nodes over the size of the smaller distinct set. :)
+  let $shared := count( set:deep-intersect($s1, $s2) )
+  let $smallest := min( ( count(set:distinct($s1)) , count(set:distinct($s2)) ) )
+  return $shared div $smallest
+};
+
+(:~
+ : Returns the Dice similarity coefficient between two sets of XML nodes.
+ : The Dice coefficient is defined as twice the shared information between the input sets
+ : (i.e., the size of the intersection) over the sum of the cardinalities for the input sets.
+ :
+ : <br/>
+ : Example usage : <pre> dice ( ( "a", "b", <c/> ) , ( "a", "a", "d") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The Dice similarity coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/dice.xq
+ :)
+declare function set:dice ( $s1 , $s2 ) as xs:double {
+  (: Twice the shared distinct nodes over the summed distinct set sizes. :)
+  let $shared := count( set:deep-intersect($s1,$s2) )
+  let $total := count(set:distinct($s1)) + count(set:distinct($s2))
+  return (2 * $shared) div $total
+};
+
+(:~
+ : Returns the Jaccard similarity coefficient between two sets of XML nodes.
+ : The Jaccard coefficient is defined as the size of the intersection divided by the size of the
+ : union of the input sets.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard ( ( "a", "b", <c/> ) , ( "a", "a", "d") ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.25 </pre>
+ :
+ : @param $s1 The first set.
+ : @param $s2 The second set.
+ : @return The Jaccard similarity coefficient between the two sets.
+ : @example test/Queries/data-cleaning/set-similarity/jaccard.xq
+ :)
+declare function set:jaccard ( $s1 , $s2 ) as xs:double {
+  (: Size of the deep intersection over the size of the deep union. :)
+  let $shared := count( set:deep-intersect($s1,$s2) )
+  let $combined := count( set:deep-union($s1,$s2) )
+  return $shared div $combined
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq'
--- src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/token-based-string-similarity.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,249 @@
+xquery version "1.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+ :)
+
+(:~
+ : This library module provides token-based string similarity functions that view strings
+ : as sets or multi-sets of tokens and use set-related properties to compute similarity scores.
+ : The tokens correspond to groups of characters extracted from the strings being compared, such as
+ : individual words or character n-grams.
+ :
+ : These functions are particularly useful for matching near duplicate strings in cases where
+ : typographical conventions often lead to rearrangement of words (e.g., "John Smith" versus "Smith, John").
+ :
+ : The logic contained in this module is not specific to any particular XQuery implementation,
+ : although the module requires the trigonometric functions of XQuery 1.1 or a math extension
+ : function such as sqrt($x as numeric) for computing the square root.
+ :
+ : @author Bruno Martins
+ : @project data processing/data cleaning
+ :)
+
+module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+(: In the QizX or Saxon XQuery engines, it is possible to call external functions from the Java math library :)
+(: declare namespace math = "java:java.lang.Math"; :)
+declare namespace math = "http://www.w3.org/2005/xpath-functions/math";
+
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "2.0";
+
+(:~
+ : Returns the individual character n-grams forming a string.
+ :
+ : <br/>
+ : Example usage : <pre> ngrams("FLWOR", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> ("_F" , "FL" , "LW" , "WO" , "LW" , "WO" , "OR" , "R_") </pre>
+ :
+ : @param $s The input string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The sequence of strings with the extracted n-grams.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq
+ :)
+declare function simt:ngrams ( $s as xs:string, $n as xs:integer ) as xs:string* {
+ (: '_' is the padding character; literal underscores in the input are
+    escaped as "\_" so they cannot be confused with padding. :)
+ let $pad := '_'
+ return
+ (: leading n-grams: the first $a characters, left-padded to length $n :)
+ ( for $a in 1 to $n
+ let $apad := string-join( for $aux in $a + 1 to $n return $pad , '' )
+ return concat( $apad , replace(substring($s,1,$a) , "_", "\\_") ) ,
+
+ (: interior n-grams taken directly from the string, no padding :)
+ for $b in $n + 2 to string-length($s) return replace(substring($s,$b - $n, $n), "_", "\\_") ,
+
+ (: trailing n-grams, right-padded to length $n.
+    NOTE(review): this loop's range overlaps the interior loop, so some
+    n-grams near the end are emitted twice — this matches the documented
+    example output for ngrams("FLWOR", 2); confirm the duplication is
+    intended before changing it. :)
+ for $c in string-length($s) - (if ($n = 1) then (-1) else ($n)) - 1 to string-length($s)
+ let $cpad := string-join( for $aux in string-length($s) - $c + 2 to $n return $pad , '' )
+ return concat(replace(substring($s, $c, $n), "_", "\\_"), $cpad )
+ )
+};
+
+(:~
+ : Auxiliary function for computing the cosine similarity coefficient between strings,
+ : using string descriptors based on sets of character n-grams or sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> cosine( ("aa","bb") , ("bb","aa")) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 1.0 </pre>
+ :
+ : @param $desc1 The descriptor for the first string.
+ : @param $desc2 The descriptor for the second string.
+ : @return The cosine similarity coefficient between the descriptors for the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine.xq
+ :)
+declare function simt:cosine ( $desc1 as xs:string*, $desc2 as xs:string* ) as xs:double {
+  (: Build term-frequency vectors over the shared vocabulary and return the
+     normalized dot product (cosine) of the two vectors. :)
+  let $vocab := distinct-values( ($desc1, $desc2) )
+  let $tf1 := for $term in $vocab return count($desc1[.=$term])
+  let $tf2 := for $term in $vocab return count($desc2[.=$term])
+  let $dot := sum( for $w at $i in $tf1 return $w * $tf2[$i] )
+  let $norm1 := math:sqrt(sum( for $w in $tf1 return $w * $w ))
+  let $norm2 := math:sqrt(sum( for $w in $tf2 return $w * $w ))
+  return $dot div ($norm1 * $norm2)
+};
+
+(:~
+ : Returns the Dice similarity coefficient between sets of character n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> dice-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4615384615384616 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The Dice similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq
+ :)
+declare function simt:dice-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+  (: Compare the two strings through their character n-gram sets. :)
+  let $grams1 := simt:ngrams($s1,$n)
+  let $grams2 := simt:ngrams($s2,$n)
+  return set:dice($grams1,$grams2)
+};
+
+(:~
+ : Returns the overlap similarity coefficient between sets of character n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> overlap-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The overlap similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq
+ :)
+declare function simt:overlap-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+  (: Compare the two strings through their character n-gram sets. :)
+  let $grams1 := simt:ngrams($s1,$n)
+  let $grams2 := simt:ngrams($s2,$n)
+  return set:overlap($grams1,$grams2)
+};
+
+(:~
+ : Returns the Jaccard similarity coefficient between sets of character n-grams extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.3 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The Jaccard similarity coefficient between the sets of character n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq
+ :)
+declare function simt:jaccard-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+  (: Compare the two strings through their character n-gram sets. :)
+  let $grams1 := simt:ngrams($s1,$n)
+  let $grams2 := simt:ngrams($s2,$n)
+  return set:jaccard($grams1,$grams2)
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of character n-grams extracted from two strings.
+ : The n-grams from each string are weighted according to their occurrence frequency (i.e., weighted according to
+ : the term-frequency heuristic from Information Retrieval).
+ :
+ : <br/>
+ : Example usage : <pre> cosine-ngrams("DWAYNE", "DUANE", 2 ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.2401922307076307 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $n The number of characters to consider when extracting n-grams.
+ : @return The cosine similarity coefficient between the sets n-grams extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq
+ :)
+declare function simt:cosine-ngrams ( $s1 as xs:string, $s2 as xs:string, $n as xs:integer ) as xs:double {
+ let $ngrams1 := simt:ngrams($s1,$n)
+ let $ngrams2 := simt:ngrams($s2,$n)
+ return simt:cosine($ngrams1,$ngrams2)
+};
+
+(:~
+ : Returns the Dice similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> dice-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.4 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The Dice similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq
+ :)
+declare function simt:dice-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ set:dice( tokenize($s1,$r) , tokenize($s2,$r) )
+};
+
+(:~
+ : Returns the overlap similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> overlap-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.5 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The overlap similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq
+ :)
+declare function simt:overlap-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ set:overlap( tokenize($s1,$r) , tokenize($s2,$r) )
+};
+
+(:~
+ : Returns the Jaccard similarity coefficient between sets of tokens extracted from two strings.
+ :
+ : <br/>
+ : Example usage : <pre> jaccard-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.25 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The Jaccard similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq
+ :)
+declare function simt:jaccard-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ set:jaccard( tokenize($s1,$r) , tokenize($s2,$r) )
+};
+
+(:~
+ : Returns the cosine similarity coefficient between sets of tokens extracted from two strings. The tokens
+ : from each string are weighted according to their occurrence frequency (i.e., weighted according to the
+ : term-frequency heuristic from Information Retrieval).
+ :
+ : <br/>
+ : Example usage : <pre> cosine-tokens("The FLWOR Foundation", "FLWOR Found.", " +" ) </pre>
+ : <br/>
+ : The function invocation in the example above returns : <pre> 0.408248290463863 </pre>
+ :
+ : @param $s1 The first string.
+ : @param $s2 The second string.
+ : @param $r A regular expression forming the delimiter character(s) which mark the boundaries between adjacent tokens.
+ : @return The cosine similarity coefficient between the sets of tokens extracted from the two strings.
+ : @example test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq
+ :)
+declare function simt:cosine-tokens ( $s1 as xs:string, $s2 as xs:string, $r as xs:string ) as xs:double {
+ let $tokens1 := tokenize($s1,$r)
+ let $tokens2 := tokenize($s2,$r)
+ return simt:cosine($tokens1,$tokens2)
+};
=== added file 'src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd'
--- src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd 1970-01-01 00:00:00 +0000
+++ src/com/zorba-xquery/www/modules/data-cleaning/whitepages_schema.xsd 2011-12-22 13:29:42 +0000
@@ -0,0 +1,343 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified" attributeFormDefault="qualified"
+ targetNamespace="http://api.whitepages.com/schema/" xmlns:wp="http://api.whitepages.com/schema/">
+<!--
+:: Copyright 2006-2008 The FLWOR Foundation.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+::
+-->
+
+ <xs:element name="wp">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:result"/>
+ <xs:element minOccurs="0" ref="wp:errormessages"/>
+ <xs:element minOccurs="0" ref="wp:meta"/>
+ <xs:element minOccurs="0" ref="wp:listings"/>
+ <xs:element minOccurs="0" ref="wp:options"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="result">
+ <xs:complexType>
+ <xs:attribute name="code" use="required" type="wp:responsecode"/>
+ <xs:attribute name="message"/>
+ <xs:attribute name="type" use="required" type="wp:responsetype"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:simpleType name="responsetype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="success"/>
+ <xs:enumeration value="error"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <xs:simpleType name="responsecode">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="Found Data"/>
+ <xs:enumeration value="No Data Found"/>
+ <xs:enumeration value="Truncated Data"/>
+ <xs:enumeration value="Error"/>
+ <xs:enumeration value="Server Error"/>
+ <xs:enumeration value="Invalid Input"/>
+ <xs:enumeration value="Mismatched Input"/>
+ <xs:enumeration value="Missing Input"/>
+ <xs:enumeration value="Refine Input"/>
+ </xs:restriction>
+ </xs:simpleType>
+ <xs:element name="errormessages">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:message"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="message" type="xs:string"/>
+ <xs:element name="meta">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:linkexpiration"/>
+ <xs:element ref="wp:recordrange"/>
+ <xs:element ref="wp:apiversion"/>
+ <xs:element ref="wp:searchid"/>
+ <xs:element ref="wp:searchlinks"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="linkexpiration" type="xs:date"/>
+ <xs:element name="recordrange">
+ <xs:complexType>
+ <xs:attribute name="lastrecord" use="required" type="xs:integer"/>
+ <xs:attribute name="firstrecord" use="required" type="xs:integer"/>
+ <xs:attribute name="totalavailable" use="required" type="xs:integer"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="apiversion" type="xs:string"/>
+ <xs:element name="searchid" type="xs:string"/>
+ <xs:element name="searchlinks">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:link"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="listings">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:listing"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="listing">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:people"/>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:business"/>
+ <xs:element minOccurs="0" ref="wp:displayname"/>
+ <xs:element minOccurs="0" ref="wp:tagline"/>
+ <xs:element minOccurs="0" ref="wp:phonenumbers"/>
+ <xs:element minOccurs="0" ref="wp:address"/>
+ <xs:element minOccurs="0" ref="wp:geodata"/>
+ <xs:element minOccurs="0" ref="wp:listingmeta"/>
+ </xs:sequence>
+ <xs:attribute name="sponsored" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="people">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:person"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="person" type="wp:personType"/>
+ <xs:complexType name="personType">
+ <xs:sequence>
+ <xs:element ref="wp:firstname"/>
+ <xs:element minOccurs="0" ref="wp:middlename"/>
+ <xs:element ref="wp:lastname"/>
+ <xs:element minOccurs="0" ref="wp:suffix"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ </xs:complexType>
+
+ <xs:simpleType name="rank">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="primary"/>
+ <xs:enumeration value="secondary"/>
+ <xs:enumeration value="tertiary"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="firstname" type="xs:string"/>
+ <xs:element name="middlename" type="xs:string"/>
+ <xs:element name="lastname" type="xs:string"/>
+ <xs:element name="suffix" type="xs:string"/>
+ <xs:element name="business">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:businessname"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="businessname" type="xs:string"/>
+ <xs:element name="displayname" type="xs:string"/>
+ <xs:element name="tagline" type="xs:string"/>
+ <xs:element name="phonenumbers">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:phone"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="phone">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:fullphone"/>
+ <xs:element minOccurs="0" ref="wp:areacode"/>
+ <xs:element ref="wp:exchange"/>
+ <xs:element ref="wp:linenumber"/>
+ <xs:element minOccurs="0" ref="wp:carrier"/>
+ </xs:sequence>
+ <xs:attribute name="rank" use="required" type="wp:rank"/>
+ <xs:attribute name="type" use="required" type="wp:listingtype"/>
+ <xs:attribute name="carrier_only" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:simpleType name="listingtype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="work"/>
+ <xs:enumeration value="home"/>
+ <xs:enumeration value="business"/>
+ <xs:enumeration value="government"/>
+ <xs:enumeration value="mobile"/>
+ <xs:enumeration value="landline"/>
+ <xs:enumeration value="pager"/>
+ <xs:enumeration value="satellite"/>
+ <xs:enumeration value="unknown"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="fullphone" type="xs:string"/>
+ <xs:element name="areacode" type="xs:string"/>
+ <xs:element name="exchange" type="xs:string"/>
+ <xs:element name="linenumber" type="xs:string"/>
+ <xs:element name="carrier" type="xs:string"/>
+ <xs:element name="address">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:fullstreet"/>
+ <xs:element minOccurs="0" ref="wp:house"/>
+ <xs:element minOccurs="0" ref="wp:predir"/>
+ <xs:element minOccurs="0" ref="wp:street"/>
+ <xs:element minOccurs="0" ref="wp:postdir"/>
+ <xs:element minOccurs="0" ref="wp:streettype"/>
+ <xs:element minOccurs="0" ref="wp:aptnumber"/>
+ <xs:element minOccurs="0" ref="wp:apttype"/>
+ <xs:element minOccurs="0" ref="wp:city"/>
+ <xs:element minOccurs="0" ref="wp:state"/>
+ <xs:element minOccurs="0" ref="wp:zip"/>
+ <xs:element minOccurs="0" ref="wp:zip4"/>
+ <xs:element minOccurs="0" ref="wp:country"/>
+ </xs:sequence>
+ <xs:attribute name="deliverable" use="required" type="xs:boolean"/>
+ </xs:complexType>
+ </xs:element>
+
+ <xs:simpleType name="country">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="US"/>
+ <xs:enumeration value="CA"/>
+ </xs:restriction>
+ </xs:simpleType>
+
+ <xs:element name="fullstreet" type="xs:string"/>
+ <xs:element name="house" type="xs:string"/>
+ <xs:element name="predir" type="xs:string"/>
+ <xs:element name="street" type="xs:string"/>
+ <xs:element name="postdir" type="xs:string"/>
+ <xs:element name="streettype" type="xs:string"/>
+ <xs:element name="aptnumber" type="xs:string"/>
+ <xs:element name="apttype" type="xs:string"/>
+ <xs:element name="city" type="xs:string"/>
+ <xs:element name="state" type="xs:string"/>
+ <xs:element name="zip" type="xs:string"/>
+ <xs:element name="zip4" type="xs:string"/>
+ <xs:element name="country" type="wp:country"/>
+ <xs:element name="geodata">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:geoprecision"/>
+ <xs:element ref="wp:latitude"/>
+ <xs:element ref="wp:longitude"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="geoprecision" type="xs:integer"/>
+ <xs:element name="latitude" type="xs:string"/>
+ <xs:element name="longitude" type="xs:string"/>
+ <xs:element name="previous_locations">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="previous_location" maxOccurs="unbounded" type="wp:locationType"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:complexType name="locationType">
+ <xs:sequence>
+ <xs:element name="city" type="xs:string"/>
+ <xs:element name="state" type="xs:string"/>
+ <xs:element name="year" type="xs:string"/>
+ </xs:sequence>
+ </xs:complexType>
+ <xs:element name="listingmeta">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" ref="wp:lastvalidated"/>
+ <xs:element minOccurs="0" ref="wp:type"/>
+ <xs:element minOccurs="0" ref="wp:sponsor"/>
+ <xs:element minOccurs="0" ref="wp:recordid"/>
+ <xs:element ref="wp:moreinfolinks"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="lastvalidated" type="xs:string"/>
+ <xs:element name="sponsor" type="xs:string"/>
+ <xs:element name="recordid" type="xs:string"/>
+ <xs:element name="type" type="wp:listingtype"/>
+ <xs:element name="moreinfolinks">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element minOccurs="0" maxOccurs="unbounded" ref="wp:link"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="options">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element maxOccurs="unbounded" ref="wp:cityoption" minOccurs="0" />
+ <xs:element maxOccurs="unbounded" ref="wp:categoryoption" minOccurs="0" />
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="cityoption">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:refinesearchurl"/>
+ </xs:sequence>
+ <xs:attribute name="city" use="required" type="xs:string"/>
+ <xs:attribute name="country" use="required" type="wp:country"/>
+ <xs:attribute name="state" use="required" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="categoryoption">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element ref="wp:refinesearchurl"/>
+ </xs:sequence>
+ <xs:attribute name="total" use="required" type="xs:string"/>
+ <xs:attribute name="description" use="required" type="xs:string"/>
+ </xs:complexType>
+ </xs:element>
+ <xs:element name="refinesearchurl" type="xs:anyURI"/>
+ <xs:element name="link">
+ <xs:complexType>
+ <xs:simpleContent>
+ <xs:extension base="xs:anyURI">
+ <xs:attribute name="linktext" use="required" type="xs:string"/>
+ <xs:attribute name="type" use="required" type="wp:linktype"/>
+ </xs:extension>
+ </xs:simpleContent>
+ </xs:complexType>
+ </xs:element>
+ <xs:simpleType name="linktype">
+ <xs:restriction base="xs:string">
+ <xs:enumeration value="drivingdirections"/>
+ <xs:enumeration value="findneighbors"/>
+ <xs:enumeration value="homepage"/>
+ <xs:enumeration value="viewdetails"/>
+ <xs:enumeration value="viewmap"/>
+
+ <xs:enumeration value="mapareacode"/>
+
+ <xs:enumeration value="allresults"/>
+ <xs:enumeration value="mapallresults"/>
+ <xs:enumeration value="self"/>
+ <xs:enumeration value="worklistings"/>
+
+ <xs:enumeration value="viewsearchsuggestions"/>
+ </xs:restriction>
+ </xs:simpleType>
+</xs:schema>
\ No newline at end of file
=== added directory 'test'
=== renamed directory 'test' => 'test.moved'
=== added directory 'test/ExpQueryResults'
=== added directory 'test/ExpQueryResults/data-cleaning'
=== added directory 'test/ExpQueryResults/data-cleaning/character-based-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+2
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro-winkler.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.8577777777777778
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/jaro.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5853174603174604
=== added file 'test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res'
--- test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/character-based-string-similarity/needleman-wunsch.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0
=== added directory 'test/ExpQueryResults/data-cleaning/consolidation'
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<c/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<c/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-distinct-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-similar-edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+eeefff
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/least-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/leastfrequent_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+b
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/longest_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+aaa
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/matching_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a A b c AAA d
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a att1="a1" att2="a2"/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-attributes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a att1="a1" att2="a2" att3="a3"/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/><c/><d/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-distinct-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-elements.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-frequent.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-nodes.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+<a><b/></a>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-similar-edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+aaabbb
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/most-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a b c
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/shortest_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res'
--- test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/consolidation/superstring_1.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+aaa bbb
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/conversion'
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-geocode.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Portugal Lisbon praça Marquês de Pombal
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-phone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+4610 Harrison Bend Rd, Loudon, TN, US
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/address-from-user.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+222 E 53rd St, Los Angeles, CA, US
=== added file 'test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/currency-convert.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.747887218607434
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/geocode-from-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+38 -10
=== added file 'test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/phone-from-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+(520) 824-3160 (520) 824-3160
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/phone-from-user.xml.res'
=== added file 'test/ExpQueryResults/data-cleaning/conversion/unit-convert.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/unit-convert.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/unit-convert.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1.609344
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/user-from-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Sunizona Greenhouses Inc Stan Smith
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res'
--- test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/conversion/user-from-phone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Gene Simpson Homer V Simpson Homer Simpson Sue M Simpson
=== added directory 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.907838383838384
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res'
--- test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/normalization'
=== added file 'test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/normalize-address.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Portugal Lisbon Marquês de Pombal
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/to-date.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+2002-10-24
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/to-dateTime.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+2002-10-24T21:22:00
=== added file 'test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res'
--- test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/normalization/to-time.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+09:10:00
=== added directory 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone-key.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+ALKSNTR
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/metaphone.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+ALKSNTR
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex-key.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+R163
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res'
--- test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/phonetic-string-similarity/soundex.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+true
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/set-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/deep-intersect.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/deep-union.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a b c<d/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/dice.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.4
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/distinct.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+a<b/>
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/jaccard.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.25
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res'
--- test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/set-similarity/overlap.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added directory 'test/ExpQueryResults/data-cleaning/token-based-string-similarity'
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.2401922307076307
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.408248290463863
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/cosine.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+1
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.4615384615384616
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/dice-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.4
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.3
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/jaccard-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.25
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+_F FL LW WO LW WO OR R_
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-ngrams.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added file 'test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res'
--- test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/data-cleaning/token-based-string-similarity/overlap-tokens.xml.res 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+0.5
\ No newline at end of file
=== added directory 'test/Queries'
=== added directory 'test/Queries/data-cleaning'
=== added directory 'test/Queries/data-cleaning/character-based-string-similarity'
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:edit-distance("FLWOR", "FLOWER")
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/jaro-winkler.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:jaro-winkler("DWAYNE", "DUANE", 4, 0.1 )
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/jaro.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/jaro.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/jaro.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:jaro("FLWOR Found.", "FLWOR Foundation")
=== added file 'test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq'
--- test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/character-based-string-similarity/needleman-wunsch.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simc = "http://www.zorba-xquery.com/modules/data-cleaning/character-based-string-similarity";
+
+simc:needleman-wunsch("KAK", "KQRK", 1, 1)
=== added directory 'test/Queries/data-cleaning/consolidation'
=== added file 'test/Queries/data-cleaning/consolidation/least-attributes.xq'
--- test/Queries/data-cleaning/consolidation/least-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-elements.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-elements( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq'
--- test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-distinct-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-distinct-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-elements.xq'
--- test/Queries/data-cleaning/consolidation/least-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-elements( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-nodes.xq'
--- test/Queries/data-cleaning/consolidation/least-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-nodes( ( <a><b/></a>, <b><c/></b>, <d/>) )
=== added file 'test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq'
--- test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-similar-edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" )
=== added file 'test/Queries/data-cleaning/consolidation/least-tokens.xq'
--- test/Queries/data-cleaning/consolidation/least-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/least-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-tokens( ( "a b c", "a b", "a"), " +" )
=== added file 'test/Queries/data-cleaning/consolidation/leastfrequent_1.xq'
--- test/Queries/data-cleaning/consolidation/leastfrequent_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/leastfrequent_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:least-frequent( ( "a", "a", "b") )
=== added file 'test/Queries/data-cleaning/consolidation/longest_1.xq'
--- test/Queries/data-cleaning/consolidation/longest_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/longest_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:longest( ( "a", "aa", "aaa") )
=== added file 'test/Queries/data-cleaning/consolidation/matching_1.xq'
--- test/Queries/data-cleaning/consolidation/matching_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/matching_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:matching( ( "a A b", "c AAA d", "e BB f"), "A+" )
=== added file 'test/Queries/data-cleaning/consolidation/most-attributes.xq'
--- test/Queries/data-cleaning/consolidation/most-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-attributes( ( <a att1="a1" att2="a2"/>, <b att1="a1" />, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-attributes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-attributes( ( <a att1="a1" att2="a2" att3="a3"/>, <a att1="a1" att2="a2"><b att2="a2" /></a>, <c/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-elements.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-elements( ( <a><b/><c/><d/></a>, <a><b/><b/><c/></a>, <a/> ) )
=== added file 'test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq'
--- test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-distinct-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-distinct-nodes( ( <a><b/></a>, <a><a/></a>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-elements.xq'
--- test/Queries/data-cleaning/consolidation/most-elements.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-elements.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-elements( ( <a><b/></a>, <a/>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-frequent.xq'
--- test/Queries/data-cleaning/consolidation/most-frequent.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-frequent.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-frequent( ( "a", "a", "b") )
=== added file 'test/Queries/data-cleaning/consolidation/most-nodes.xq'
--- test/Queries/data-cleaning/consolidation/most-nodes.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-nodes.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-nodes( ( <a><b/></a>, <a/>, <b/>) )
=== added file 'test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq'
--- test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-similar-edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-similar-edit-distance( ( "aaabbbccc", "aaabbb", "eeefff" ), "aaab" )
=== added file 'test/Queries/data-cleaning/consolidation/most-tokens.xq'
--- test/Queries/data-cleaning/consolidation/most-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/most-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:most-tokens( ( "a b c", "a b", "a"), " +" )
=== added file 'test/Queries/data-cleaning/consolidation/shortest_1.xq'
--- test/Queries/data-cleaning/consolidation/shortest_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/shortest_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:shortest( ( "a", "aa", "aaa") )
=== added file 'test/Queries/data-cleaning/consolidation/superstring_1.xq'
--- test/Queries/data-cleaning/consolidation/superstring_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/consolidation/superstring_1.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace con = "http://www.zorba-xquery.com/modules/data-cleaning/consolidation";
+
+con:superstring( ( "aaa bbb ccc", "aaa bbb", "aaa ddd", "eee fff" ) )
=== added directory 'test/Queries/data-cleaning/conversion'
=== added file 'test/Queries/data-cleaning/conversion/address-from-geocode.xq'
--- test/Queries/data-cleaning/conversion/address-from-geocode.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-geocode.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-geocode ( 38.725735 , -9.15021 )
=== added file 'test/Queries/data-cleaning/conversion/address-from-phone.xq'
--- test/Queries/data-cleaning/conversion/address-from-phone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-phone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-phone ('8654582358')
=== added file 'test/Queries/data-cleaning/conversion/address-from-user.xq'
--- test/Queries/data-cleaning/conversion/address-from-user.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/address-from-user.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:address-from-user ('Maria Lurdes')
=== added file 'test/Queries/data-cleaning/conversion/currency-convert.xq'
--- test/Queries/data-cleaning/conversion/currency-convert.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/currency-convert.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:currency-convert ( 1, "USD", "EUR", "2011-01-18" )
=== added file 'test/Queries/data-cleaning/conversion/geocode-from-address.xq'
--- test/Queries/data-cleaning/conversion/geocode-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/geocode-from-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,5 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+let $geocode := conversion:geocode-from-address ( ("Lisboa", "Portugal") )
+for $result in $geocode
+return floor($result)
=== added file 'test/Queries/data-cleaning/conversion/phone-from-address.xq'
--- test/Queries/data-cleaning/conversion/phone-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/phone-from-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:phone-from-address('5655 E Gaskill Rd, Willcox, AZ, US')
=== added file 'test/Queries/data-cleaning/conversion/phone-from-user.xq'
--- test/Queries/data-cleaning/conversion/phone-from-user.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/phone-from-user.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:phone-from-user ('Maria Lurdes')
=== added file 'test/Queries/data-cleaning/conversion/unit-convert.xq'
--- test/Queries/data-cleaning/conversion/unit-convert.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/unit-convert.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:unit-convert ( 1 , "Distance", "mile", "kilometer" )
=== added file 'test/Queries/data-cleaning/conversion/user-from-address.xq'
--- test/Queries/data-cleaning/conversion/user-from-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/user-from-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:user-from-address('5655 E Gaskill Rd, Willcox, AZ, US')
=== added file 'test/Queries/data-cleaning/conversion/user-from-phone.xq'
--- test/Queries/data-cleaning/conversion/user-from-phone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/conversion/user-from-phone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace conversion = "http://www.zorba-xquery.com/modules/data-cleaning/conversion";
+
+conversion:user-from-phone ('8654582358')
=== added directory 'test/Queries/data-cleaning/hybrid-string-similarity'
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/monge-elkan-jaro-winkler.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:monge-elkan-jaro-winkler("Comput. Sci. and Eng. Dept., University of California, San Diego", "Department of Computer Scinece, Univ. Calif., San Diego", 4, 0.1)
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-edit-distance.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-edit-distance("The FLWOR Foundation", "FLWOR Found.", " +", 0 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro-winkler.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-jaro-winkler("The FLWOR Foundation", "FLWOR Found.", " +", 1, 4, 0.1 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-jaro.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-jaro("The FLWOR Foundation", "FLWOR Found.", " +", 1 )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-metaphone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-metaphone("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +" )
=== added file 'test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq'
--- test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/hybrid-string-similarity/soft-cosine-tokens-soundex.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simh = "http://www.zorba-xquery.com/modules/data-cleaning/hybrid-string-similarity";
+
+simh:soft-cosine-tokens-soundex("ALEKSANDER SMITH", "ALEXANDER SMYTH", " +")
=== added directory 'test/Queries/data-cleaning/normalization'
=== added file 'test/Queries/data-cleaning/normalization/normalize-address.xq'
--- test/Queries/data-cleaning/normalization/normalize-address.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/normalize-address.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:normalize-address ( ( 'Marques de Pombal' , 'Lisboa' ) )
=== added file 'test/Queries/data-cleaning/normalization/to-date.xq'
--- test/Queries/data-cleaning/normalization/to-date.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-date.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-date ( "24OCT2002" , "%d%b%Y" )
=== added file 'test/Queries/data-cleaning/normalization/to-dateTime.spec'
--- test/Queries/data-cleaning/normalization/to-dateTime.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-dateTime.spec 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/data-cleaning/normalization:notsupported
=== added file 'test/Queries/data-cleaning/normalization/to-dateTime.xq'
--- test/Queries/data-cleaning/normalization/to-dateTime.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-dateTime.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-dateTime( "24OCT2002 21:22" , "%d%b%Y %H%M" )
=== added file 'test/Queries/data-cleaning/normalization/to-time.spec'
--- test/Queries/data-cleaning/normalization/to-time.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-time.spec 2011-12-22 13:29:42 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/data-cleaning/normalization:notsupported
=== added file 'test/Queries/data-cleaning/normalization/to-time.xq'
--- test/Queries/data-cleaning/normalization/to-time.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/normalization/to-time.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace normalization = "http://www.zorba-xquery.com/modules/data-cleaning/normalization";
+
+normalization:to-time ( "09 hours 10 minutes" , "%H hours %M minutes" )
=== added directory 'test/Queries/data-cleaning/phonetic-string-similarity'
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/metaphone-key.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:metaphone-key("ALEKSANDER")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/metaphone.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:metaphone-key("ALEKSANDER")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/soundex-key.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:soundex-key("Robert")
=== added file 'test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq'
--- test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/phonetic-string-similarity/soundex.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simp = "http://www.zorba-xquery.com/modules/data-cleaning/phonetic-string-similarity";
+
+simp:soundex( "Robert" , "Rupert" )
=== added directory 'test/Queries/data-cleaning/set-similarity'
=== added file 'test/Queries/data-cleaning/set-similarity/deep-intersect.xq'
--- test/Queries/data-cleaning/set-similarity/deep-intersect.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/deep-intersect.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:deep-intersect ( ( "a", "b", "c") , ( "a", "a", <d/> ) )
=== added file 'test/Queries/data-cleaning/set-similarity/deep-union.xq'
--- test/Queries/data-cleaning/set-similarity/deep-union.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/deep-union.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:deep-union ( ( "a", "b", "c") , ( "a", "a", <d/> ) )
=== added file 'test/Queries/data-cleaning/set-similarity/dice.xq'
--- test/Queries/data-cleaning/set-similarity/dice.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/dice.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:dice ( ( "a", "b", <c/> ) , ( "a", "a", "d") )
=== added file 'test/Queries/data-cleaning/set-similarity/distinct.xq'
--- test/Queries/data-cleaning/set-similarity/distinct.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/distinct.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:distinct (( "a", "a", <b/> ))
=== added file 'test/Queries/data-cleaning/set-similarity/jaccard.xq'
--- test/Queries/data-cleaning/set-similarity/jaccard.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/jaccard.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:jaccard ( ( "a", "b", <c/> ) , ( "a", "a", "d") )
=== added file 'test/Queries/data-cleaning/set-similarity/overlap.xq'
--- test/Queries/data-cleaning/set-similarity/overlap.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/set-similarity/overlap.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace set = "http://www.zorba-xquery.com/modules/data-cleaning/set-similarity";
+
+set:overlap ( ( "a", "b", <c/> ) , ( "a", "a", "b" ) )
=== added directory 'test/Queries/data-cleaning/token-based-string-similarity'
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/cosine.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/cosine.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/cosine.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:cosine( ("aa","bb") , ("bb","aa"))
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/dice-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:dice-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/dice-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:dice-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/jaccard-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:jaccard-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/jaccard-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:jaccard-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:ngrams("FLWOR", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/overlap-ngrams.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:overlap-ngrams("DWAYNE", "DUANE", 2 )
=== added file 'test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq'
--- test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/data-cleaning/token-based-string-similarity/overlap-tokens.xq 2011-12-22 13:29:42 +0000
@@ -0,0 +1,3 @@
+import module namespace simt = "http://www.zorba-xquery.com/modules/data-cleaning/token-based-string-similarity";
+
+simt:overlap-tokens("The FLWOR Foundation", "FLWOR Found.", " +" )