← Back to team overview

zorba-coders team mailing list archive

[Merge] lp:~zorba-coders/zorba/my_conv_module into lp:zorba/data-converters-module

 

Sorin Marian Nasoi has proposed merging lp:~zorba-coders/zorba/my_conv_module into lp:zorba/data-converters-module.

Requested reviews:
  Zorba Coders (zorba-coders)
  Nicolae Brinza (nbrinza)
  Sorin Marian Nasoi (sorin.marian.nasoi)

For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/my_conv_module/+merge/95051

Upgraded web_crawler.xq.
Also removed some expected failures for ZORBA_NO_ICU, because the forthcoming no_unicode branch already fixes those.

This branch was started from
lp:~danielturcanu/zorba/my_conv_module
developed by Daniel Turcanu.
-- 
https://code.launchpad.net/~zorba-coders/zorba/my_conv_module/+merge/95051
Your team Zorba Coders is requested to review the proposed merge of lp:~zorba-coders/zorba/my_conv_module into lp:zorba/data-converters-module.
=== modified file 'src/com/zorba-xquery/www/modules/converters/CMakeLists.txt'
--- src/com/zorba-xquery/www/modules/converters/CMakeLists.txt	2012-01-23 19:28:30 +0000
+++ src/com/zorba-xquery/www/modules/converters/CMakeLists.txt	2012-02-28 21:12:22 +0000
@@ -59,7 +59,3 @@
 ADD_TEST ("streamable_string_streambuf" StreambufUnitTests "streambuftest")
 ADD_TEST_DIRECTORY("${PROJECT_SOURCE_DIR}/test")
 
-IF(ZORBA_NO_UNICODE)
-  SET_TESTS_PROPERTIES(zorba_data-converters_module/converters/csv/csv_parse_utf8_11.xq
-                      PROPERTIES WILL_FAIL TRUE)
-ENDIF(ZORBA_NO_UNICODE)

=== modified file 'test_html/Queries/converters/html/link_crawler2.xq2'
--- test_html/Queries/converters/html/link_crawler2.xq2	2011-10-06 07:40:17 +0000
+++ test_html/Queries/converters/html/link_crawler2.xq2	2012-02-28 21:12:22 +0000
@@ -18,6 +18,7 @@
 import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";;
 import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";;
 import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";;
+import module namespace file = "http://expath.org/ns/file";;
 
 declare namespace ann = "http://www.zorba-xquery.com/annotations";;
 declare namespace xhtml="http://www.w3.org/1999/xhtml";;
@@ -25,16 +26,23 @@
 declare namespace err="http://www.w3.org/2005/xqt-errors";;
 declare namespace httpsch = "http://expath.org/ns/http-client";;
 
-declare variable $top-uri  as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";;
-declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";;
-
-
-declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity",
-                         "application/atom+xml", "text/html");
-
-
-declare variable $local:processed-internal-links:=xs:QName("processed-internal-links");
-declare variable $local:processed-external-links  :=xs:QName("processed-external-links");
+declare variable $top-uri  as xs:string := "http://www.zorba-xquery.com/html/index/";;
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com";;
+
+
+
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
+declare variable $local:tidy-options := <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options"; >
+                                         <tidyParam name="output-xml" value="yes" />
+                                         <tidyParam name="doctype" value="omit" />
+                                         <tidyParam name="quote-nbsp" value="no" />
+                                         <tidyParam name="char-encoding" value="utf8" />
+                                         <tidyParam name="newline" value="LF" />
+                                         <tidyParam name="tidy-mark" value="no" />
+                                         <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+                                        </options>;
+
 
 
 declare %ann:sequential function local:create-containers()
@@ -59,9 +67,19 @@
 return  if($sb = "") then  $s1 else $sb
 };
 
-declare function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string
+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
 {
-   local:my-substring-before(resolve-uri($href, $start-uri), "#")
+   variable $absuri;
+   try{
+    $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+   }
+   catch *
+   { 
+     map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>, 
+                                                  <MESSAGE>malformed</MESSAGE>,
+                                                  <RESULT>broken</RESULT>), $href);
+   }
+   $absuri
 };
 
 
@@ -70,13 +88,22 @@
    local:my-substring-before($http-call/httpsch:header[@name = 'Content-Type'][1]/string(@value), ";")
 };
 
-declare function local:alive($http-call as node()*) as xs:boolean
-{
- if(($http-call[1]/@status eq 200)) then true() else false()
-};
-
-
-declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+declare function local:alive($http-call as item()*) as xs:boolean
+{
+ if((count($http-call) ge 1) and 
+    ($http-call[1]/@status eq 200)) 
+   then true() else fn:trace(false(), "alive")
+};
+
+declare function local:is-redirect($http-call as item()*) as xs:boolean
+{
+ if((count($http-call) ge 1) and 
+    (($http-call[1]/@status idiv 100) eq 3)) 
+   then fn:trace(true(), "redirect") else false()
+};
+
+
+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
 {  distinct-values( for $y in  ($content//*:a/string(@href),
                               $content//*:link/string(@href),
                               $content//*:script/string(@src),
@@ -87,100 +114,132 @@
 };
 
 
-declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
 
       distinct-values( 
          let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
          for $other-uri2 in  $search//group[@nr=8]/string()
-         let $y:= fn:normalize-space($other-uri2)
-         return local:get-real-link($y, $uri)
+         return local:get-real-link($other-uri2, $uri)
      )
 };
 
 
-
-declare  %ann:sequential function local:process-external-link($x as xs:string){
+declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*) 
+{
+  if(count($http-result) ge 1) 
+    then 
+      map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
+                             <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
+                             <RESULT>{if(local:alive($http-result)) 
+                                        then "Ok" 
+                                        else if(local:is-redirect($http-result))
+                                                then "redirect" 
+                                                else "broken"
+                             }</RESULT>), $url);
+    else map:insert($map-name, <RESULT>broken</RESULT>, $url);
+  if(local:is-redirect($http-result)) then
+    map:insert($map-name, <REDIRECT>{fn:string($http-result[1]/httpsch:header[@name = "Location"]/@value)}</REDIRECT>, $url);
+  else {}
+};
+
+declare %ann:sequential function local:process-link($x as xs:string, $baseUri as xs:string, $n as xs:integer) as item()*{
+ if(local:is-internal($x))
+       then local:process-internal-link($x, $baseUri, $n);
+       else local:process-external-link($x, $baseUri);
+
+};
+
+declare  %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
   if(not(empty(map:get($local:processed-external-links, $x))))
          then   exit returning false();
          else {}
- variable $http-call:=();
+  fn:trace($x, "HEAD external link");
+  map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
+  variable $http-call:=();
   try{
-        $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+        $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+        if((count($http-call) ge 1) and 
+            fn:not($http-call[1]/@status eq 200)) then
+        {
+           if(local:is-redirect($http-call)) then
+           {
+             local:map-insert-result($local:processed-external-links, $x, $http-call);
+           }
+           else {} 
+           $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+           local:map-insert-result($local:processed-external-links, $x, $http-call); 
+        }
+        else
+        {}
   }
-  catch * {}
-  if( local:alive($http-call))
-          then map:insert($local:processed-external-links, "clean", $x);
-          else map:insert($local:processed-external-links, "broken", $x);
-};
-
-declare function local:tidy-options()
-{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options"; >
-                                         <tidyParam name="output-xml" value="yes" />
-                                         <tidyParam name="doctype" value="omit" />
-                                         <tidyParam name="quote-nbsp" value="no" />
-                                         <tidyParam name="char-encoding" value="utf8" />
-                                         <tidyParam name="newline" value="LF" />
-                                         <tidyParam name="tidy-mark" value="no" />
-                                         <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
-                                       </options>
-};
-
-
-declare  %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
-      if($n=3) then exit returning (); else {}
+  catch * 
+  { $http-call:=();}
+  local:map-insert-result($local:processed-external-links, $x, $http-call); 
+};
+
+
+declare  %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
+      (: if($n=3) then exit returning (); else {} :)
       if(not(empty(map:get($local:processed-internal-links, $x))))
             then exit returning false();
               else {}
+      fn:trace($x, "GET internal link");
+      map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
        variable $http-call:=();
        try{
-             $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
-       }
-       catch * {}
-      if( not(local:alive($http-call)))
-                then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
-               else {}
-       if(not (local:get-media-type($http-call[1]) = $supported-media-types))
-                then {map:insert($local:processed-internal-links, "clean", $x);  exit returning ();}
-                else {}
-       variable $string-content := xs:string($http-call[2]);
+          $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}" follow-redirect="false"/>, (), ());
+       }
+       catch * { }
+       if(local:is-redirect($http-call)) then
+       {
+         local:map-insert-result($local:processed-internal-links, $x, $http-call);
+         try{
+            $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+         }
+         catch * { }
+       }
+       else {}
+       if( not(local:alive($http-call)))
+               then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+               else {}
+       
+       if(not (local:get-media-type($http-call[1]) = "text/html"))
+               then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+               else {}
+       variable $string-content := string($http-call[2]);
        variable $content:=();
 
        try{
-             $content:=html:parse($string-content,local:tidy-options() );
+             $content:=html:parse($string-content,$local:tidy-options );
+             local:map-insert-result($local:processed-internal-links, $x, $http-call); 
         }
         catch *
-             { 
-                 map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x); 
+             {   
+                 map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
+                                                              <RESULT>broken</RESULT>), $x); 
                  try{
                        $content:=parse-xml:parse-xml-fragment ($string-content, "");
                  }
                  catch *
-                     { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);}
+                     { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
             }
        variable $links :=();
        if(empty($content))
-            then $links:=local:get-out-links-unparsed($string-content, $x);
+           then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
            else $links:=local:get-out-links-parsed($content, $x);
        for $l in $links
-       return  local:process-link($l, $n+1);
-};
-
-
-
-declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer) as item()*{
- if(local:is-internal($x))
-       then local:process-internal-link($x, $n);
-       else local:process-external-link($x);
-
-};
+       return  local:process-link($l, $x, $n+1);
+};
+
+
 
 
 declare function local:print-results() as element()*
 {
     for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
-    return <INTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-internal-links,$x)}</RESULT></INTERNAL>, 
+    return <INTERNAL><LINK>{$x}</LINK>{map:get($local:processed-internal-links,$x)}</INTERNAL>, 
      for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
-     return <EXTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-external-links,$x)}</RESULT></EXTERNAL>
+     return <EXTERNAL><LINK>{$x}</LINK>{map:get($local:processed-external-links,$x)}</EXTERNAL>
 };
 
 (:==========================================
@@ -190,13 +249,15 @@
 
 variable $result;
 
-try {
-  local:create-containers();
-  local:process-link($uri, 1);
-  $result:=local:print-results() ;
-}
-catch * { $result:=concat("an error occurred", $err:description);}
-
+local:create-containers();
+local:process-link($uri, "", 1);
+$result:=local:print-results() ;
 
 local:delete-containers();
-$result
+
+file:write(fn:resolve-uri("link_crawler_result.xml"),
+            <result>{$result}</result>,
+            <output:serialization-parameters>
+                <output:indent value="yes"/>
+            </output:serialization-parameters>)
+


Follow ups