zorba-coders team mailing list archive

[Merge] lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba

 

Daniel Turcanu has proposed merging lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.

Requested reviews:
  Sorin Marian Nasoi (sorin.marian.nasoi)

For more details, see:
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/78614

Updated the web crawler tutorial with the latest fixes in link_crawler2.xq.
-- 
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/78614
Your team Zorba Coders is subscribed to branch lp:zorba.
=== added file 'doc/zorba/link_crawler2.dox'
--- doc/zorba/link_crawler2.dox	1970-01-01 00:00:00 +0000
+++ doc/zorba/link_crawler2.dox	2011-10-07 14:41:17 +0000
@@ -0,0 +1,221 @@
+/**
+\page link_crawler2  Web Crawler example in XQuery
+\code
+(:
+ : Copyright 2006-2011 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+:)
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import module namespace file = "http://expath.org/ns/file";
+
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+declare namespace xhtml="http://www.w3.org/1999/xhtml";
+declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
+declare namespace err="http://www.w3.org/2005/xqt-errors";
+declare namespace httpsch = "http://expath.org/ns/http-client";
+
+declare variable $top-uri  as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
+
+
+
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
+
+
+declare %ann:sequential function local:create-containers()
+{
+  map:create($local:processed-internal-links, xs:QName("xs:string"));
+  map:create($local:processed-external-links, xs:QName("xs:string"));
+};
+
+declare %ann:sequential function local:delete-containers(){
+  for $x in map:available-maps()
+  return map:delete($x);
+};
+
+declare function local:is-internal($x as xs:string) as xs:boolean
+{
+ starts-with($x, $uri-host)
+};
+
+declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string
+{
+let $sb := fn:substring-before($s1, $s2)
+return  if($sb = "") then  $s1 else $sb
+};
+
+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
+{
+   variable $absuri;
+   try{
+    $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+   }
+   catch *
+   { 
+     map:insert($local:processed-external-links, fn:concat("malformed, referenced in page ", $start-uri), $href);
+   }
+   $absuri
+};
+
+
+declare  function local:get-media-type ($http-call as node()) as xs:string
+{
+   local:my-substring-before($http-call/httpsch:header[@name = 'Content-Type'][1]/string(@value), ";")
+};
+
+declare function local:alive($http-call as item()*) as xs:boolean
+{
+ if((count($http-call) ge 1) and 
+    ($http-call[1]/@status eq 200)) 
+   then true() else fn:trace(false(), "alive")
+};
+
+
+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+{  distinct-values( for $y in  ($content//*:a/string(@href),
+                              $content//*:link/string(@href),
+                              $content//*:script/string(@src),
+                              $content//*:img/string(@src),
+                              $content//*:area/string(@href)
+                              )
+return  local:get-real-link($y, $uri))
+};
+
+
+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+
+      distinct-values( 
+         let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
+         for $other-uri2 in  $search//group[@nr=8]/string()
+         return local:get-real-link($other-uri2, $uri)
+     )
+};
+
+
+
+declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer) as item()*{
+ if(local:is-internal($x))
+       then local:process-internal-link($x, $n);
+       else local:process-external-link($x);
+
+};
+
+declare  %ann:sequential function local:process-external-link($x as xs:string){
+  if(not(empty(map:get($local:processed-external-links, $x))))
+         then   exit returning false();
+         else {}
+ fn:trace($x, "HEAD external link");
+ variable $http-call:=();
+  try{
+        $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+  }
+  catch * {}
+  if( local:alive($http-call))
+          then map:insert($local:processed-external-links, "clean", $x);
+          else map:insert($local:processed-external-links, "broken", $x);
+};
+
+declare function local:tidy-options()
+{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+                                         <tidyParam name="output-xml" value="yes" />
+                                         <tidyParam name="doctype" value="omit" />
+                                         <tidyParam name="quote-nbsp" value="no" />
+                                         <tidyParam name="char-encoding" value="utf8" />
+                                         <tidyParam name="newline" value="LF" />
+                                         <tidyParam name="tidy-mark" value="no" />
+                                         <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+                                       </options>
+};
+
+
+declare  %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
+      (: if($n=3) then exit returning (); else {} :)
+      if(not(empty(map:get($local:processed-internal-links, $x))))
+            then exit returning false();
+              else {}
+      fn:trace($x, "GET internal link");
+       variable $http-call:=();
+       try{
+             $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+       }
+       catch * { }
+       if( not(local:alive($http-call)))
+                then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
+               else {}
+       
+       if(not (local:get-media-type($http-call[1]) = "text/html"))
+                then { map:insert($local:processed-internal-links, "clean", $x); exit returning ();}
+                else {}
+       variable $string-content := string($http-call[2]);
+       variable $content:=();
+
+       try{
+             $content:=html:parse($string-content,local:tidy-options() );
+             map:insert($local:processed-internal-links, "clean", $x);
+        }
+        catch *
+             {   
+                 map:insert($local:processed-internal-links, concat("cannot tidy ", $err:description), $x); 
+                 try{
+                       $content:=parse-xml:parse-xml-fragment ($string-content, "");
+                 }
+                 catch *
+                     { map:insert($local:processed-internal-links, concat("cannot parse ", $err:description), $x);}
+            }
+       variable $links :=();
+       if(empty($content))
+           then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
+           else $links:=local:get-out-links-parsed($content, $x);
+       for $l in $links
+       return  local:process-link($l, $n+1);
+};
+
+
+
+
+declare function local:print-results() as element()*
+{
+    for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
+    return <INTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-internal-links,$x)}</RESULT></INTERNAL>, 
+     for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
+     return <EXTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-external-links,$x)}</RESULT></EXTERNAL>
+};
+
+(:==========================================
+===========================================:)
+
+variable $uri:= $top-uri;
+
+variable $result;
+
+local:create-containers();
+local:process-link($uri, 1);
+$result:=local:print-results() ;
+
+local:delete-containers();
+
+file:write(fn:resolve-uri("link_crawler_result.xml"),
+            <result>{$result}</result>,
+            <output:serialization-parameters>
+                <output:indent value="yes"/>
+            </output:serialization-parameters>)
+
+
+\endcode
+*/
\ No newline at end of file

=== modified file 'doc/zorba/web_crawler.dox'
--- doc/zorba/web_crawler.dox	2011-10-05 22:39:18 +0000
+++ doc/zorba/web_crawler.dox	2011-10-07 14:41:17 +0000
@@ -1,8 +1,13 @@
 /**
 \page web_crawler_tutorial  Web Crawler example in XQuery
 
-Description of a web crawler example in XQuery.
+Description of a web crawler example in XQuery.<br/>
+The entire script can be seen here:
+\link link_crawler2
+web crawler script
+\endlink
 
+<br/>
 The idea is to crawl through the pages of a website and store a list with external pages and internal pages and check if they work or not.
 This example uses Zorba's http module for accessing the webpages, and the html module for converting the html to xml.
 The complete code can be found in the test directory of the html convertor module.
@@ -12,6 +17,7 @@
 import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
 import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
 import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import module namespace file = "http://expath.org/ns/file";
 \endcode
 
 The internal pages are checked recursively, while the external ones are only checked for existence.
@@ -55,10 +61,24 @@
 After parsing an internal page with html module, all the links are extracted and parsed recursively, if they haven't been parsed.
 The html module uses tidy library, so we use tidy options to setup for converting from html to xml. 
 Some html tags are marked to be ignored in new-inline-tags param, this being a particular case of this website. 
-You can add or remove tags to suit your website needs.
+You can add or remove tags to suit your website needs.<br/>
+Whitespace in the links is trimmed and normalized, and the fragment part is ignored.
 
 \code
-declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
+{
+   variable $absuri;
+   try{
+    $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+   }
+   catch *
+   { 
+     map:insert($local:processed-external-links, fn:concat("malformed, referenced in page ", $start-uri), $href);
+   }
+   $absuri
+};
+
+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
 {  distinct-values( for $y in  ($content//*:a/string(@href),
                               $content//*:link/string(@href),
                               $content//*:script/string(@src),
@@ -81,39 +101,42 @@
 };
 
 declare  %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
-      if($n=3) then exit returning (); else {}
+      (: if($n=3) then exit returning (); else {} :)
       if(not(empty(map:get($local:processed-internal-links, $x))))
             then exit returning false();
               else {}
+      fn:trace($x, "GET internal link");
        variable $http-call:=();
        try{
              $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
        }
-       catch * {}
-      if( not(local:alive($http-call)))
+       catch * { }
+       if( not(local:alive($http-call)))
                 then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
                else {}
-       if(not (local:get-media-type($http-call[1]) = $supported-media-types))
-                then {map:insert($local:processed-internal-links, "clean", $x);  exit returning ();}
+       
+       if(not (local:get-media-type($http-call[1]) = "text/html"))
+                then { map:insert($local:processed-internal-links, "clean", $x); exit returning ();}
                 else {}
-       variable $string-content := xs:string($http-call[2]);
+       variable $string-content := string($http-call[2]);
        variable $content:=();
 
        try{
              $content:=html:parse($string-content,local:tidy-options() );
+             map:insert($local:processed-internal-links, "clean", $x);
         }
         catch *
-             { 
-                 map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x); 
+             {   
+                 map:insert($local:processed-internal-links, concat("cannot tidy ", $err:description), $x); 
                  try{
                        $content:=parse-xml:parse-xml-fragment ($string-content, "");
                  }
                  catch *
-                     { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);}
+                     { map:insert($local:processed-internal-links, concat("cannot parse ", $err:description), $x);}
             }
        variable $links :=();
        if(empty($content))
-            then $links:=local:get-out-links-unparsed($string-content, $x);
+           then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
            else $links:=local:get-out-links-parsed($content, $x);
        for $l in $links
        return  local:process-link($l, $n+1);
@@ -125,13 +148,12 @@
 When the parsing fails, we fallback to using regex for extracting the links.
 
 \code
-declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
 
       distinct-values( 
          let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
          for $other-uri2 in  $search//group[@nr=8]/string()
-         let $y:= fn:normalize-space($other-uri2)
-         return local:get-real-link($y, $uri)
+         return local:get-real-link($other-uri2, $uri)
      )
 };
 
@@ -144,6 +166,7 @@
   if(not(empty(map:get($local:processed-external-links, $x))))
          then   exit returning false();
          else {}
+ fn:trace($x, "HEAD external link");
  variable $http-call:=();
   try{
         $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());