← Back to team overview

zorba-coders team mailing list archive

[Merge] lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba

 

Daniel Turcanu has proposed merging lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.

Requested reviews:
  Zorba Coders (zorba-coders)

For more details, see:
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/78243
-- 
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/78243
Your team Zorba Coders is requested to review the proposed merge of lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.
=== added file 'doc/zorba/link_crawler2.dox'
--- doc/zorba/link_crawler2.dox	1970-01-01 00:00:00 +0000
+++ doc/zorba/link_crawler2.dox	2011-10-05 12:23:32 +0000
@@ -0,0 +1,208 @@
+/**
+\page link_crawler2  Web Crawler example in XQuery
+\code
+(:
+ : Copyright 2006-2011 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+:)
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+declare namespace xhtml="http://www.w3.org/1999/xhtml";
+declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
+declare namespace err="http://www.w3.org/2005/xqt-errors";
+declare namespace httpsch = "http://expath.org/ns/http-client";
+
+(: Page the crawl starts from. :)
+declare variable $top-uri  as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
+(: Links that start with this prefix are treated as internal (see local:is-internal). :)
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
+
+
+(: Media types whose response body is scanned for further links. :)
+declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity",
+                         "application/atom+xml", "text/html");
+
+
+(: Names of the two maps that record the outcome for every visited link. :)
+declare variable $local:processed-internal-links:=xs:QName("processed-internal-links");
+declare variable $local:processed-external-links  :=xs:QName("processed-external-links");
+
+
+(:~
+ : Creates the two maps named by $local:processed-internal-links and
+ : $local:processed-external-links.
+ : NOTE(review): xs:string is passed as the type argument of map:create,
+ : presumably the key type - confirm against the unordered-map module docs.
+ :)
+declare %ann:sequential function local:create-containers()
+{
+  map:create($local:processed-internal-links, xs:QName("xs:string"));
+  map:create($local:processed-external-links, xs:QName("xs:string"));
+};
+
+(:~
+ : Deletes every map reported by map:available-maps(), i.e. not only the
+ : two maps created by local:create-containers().
+ :)
+declare %ann:sequential function local:delete-containers(){
+  for $x in map:available-maps()
+  return map:delete($x);
+};
+
+(:~
+ : Tells whether a link belongs to the crawled site.
+ :
+ : @param $x link to classify
+ : @return true() when $x begins with $uri-host, false() otherwise
+ :)
+declare function local:is-internal($x as xs:string) as xs:boolean
+{
+  fn:starts-with($x, $uri-host)
+};
+
+(:~
+ : Returns the part of $s1 that precedes the first occurrence of $s2.
+ :
+ : @param $s1 string to cut
+ : @param $s2 separator to look for
+ : @return the text before the separator ("" when $s1 starts with it), or
+ :         $s1 unchanged when $s2 is empty or does not occur in $s1
+ :)
+declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string
+{
+  (: Test for the separator explicitly: fn:substring-before also returns ""
+     when the separator sits at position 1, so an empty result cannot be
+     used to detect "separator absent". :)
+  if ($s2 ne "" and fn:contains($s1, $s2))
+  then fn:substring-before($s1, $s2)
+  else $s1
+};
+
+(:~
+ : Turns a link found in a page into an absolute URI without its fragment.
+ :
+ : @param $href link as written in the page
+ : @param $start-uri URI of the page, used as the base for resolution
+ : @return $href resolved against $start-uri, cut at the first "#"
+ :)
+declare function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string
+{
+  let $absolute := fn:resolve-uri($href, $start-uri)
+  return local:my-substring-before($absolute, "#")
+};
+
+
+(:~
+ : Extracts the media type from the Content-Type header of an HTTP response.
+ :
+ : @param $http-call response node whose httpsch:header children are inspected
+ : @return the header value up to the first ";", trimmed and lower-cased,
+ :         or "" when the response has no Content-Type header
+ :)
+declare  function local:get-media-type ($http-call as node()) as xs:string
+{
+  (: HTTP header names are case-insensitive, so compare them in lower case. :)
+  let $content-type := ($http-call/httpsch:header[fn:lower-case(@name) eq "content-type"])[1]/fn:string(@value)
+  return
+    (: Without this guard a missing header would pass an empty sequence to an
+       xs:string parameter and raise a type error. :)
+    if (fn:empty($content-type))
+    then ""
+    else fn:lower-case(fn:normalize-space(local:my-substring-before($content-type, ";")))
+};
+
+(:~
+ : Tells whether an HTTP call succeeded.
+ :
+ : @param $http-call items returned by the HTTP request, possibly empty
+ : @return true() when the first item has a status attribute equal to 200,
+ :         false() otherwise (including for an empty sequence)
+ :)
+declare function local:alive($http-call as node()*) as xs:boolean
+{
+  fn:boolean($http-call[1]/@status eq 200)
+};
+
+
+(:~
+ : Collects the outgoing links of a page that was parsed into nodes.
+ :
+ : @param $content parsed page content
+ : @param $uri URI of the page, used to resolve relative links
+ : @return the distinct resolved targets of a/link/area href attributes and
+ :         script/img src attributes, in any namespace
+ :)
+declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+{
+  let $raw-targets :=
+    (
+      $content//*:a/string(@href),
+      $content//*:link/string(@href),
+      $content//*:script/string(@src),
+      $content//*:img/string(@src),
+      $content//*:area/string(@href)
+    )
+  return
+    fn:distinct-values(
+      for $target in $raw-targets
+      return local:get-real-link($target, $uri)
+    )
+};
+
+
+(:~
+ : Collects the outgoing links of a page that could not be parsed, by
+ : matching href/src attributes in its text with a regular expression.
+ :
+ : @param $content page content as text
+ : @param $uri URI of the page, used to resolve relative links
+ : @return the distinct resolved link targets (capture group 8 of the regex)
+ :)
+declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+
+      distinct-values( 
+         let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
+         (: The elements of the analyze-string result are namespace-qualified,
+            so an unprefixed "group" name test would match nothing; use the
+            namespace wildcard, as done for *:a elsewhere in this script. :)
+         for $other-uri2 in  $search//*:group[@nr=8]/string()
+         let $y:= fn:normalize-space($other-uri2)
+         return local:get-real-link($y, $uri)
+     )
+};
+
+
+
+(:~
+ : Checks an external link with a HEAD request and records the outcome in
+ : the map named by $local:processed-external-links.
+ :
+ : @param $x link to check
+ : @return false() when $x already has a recorded outcome; otherwise the
+ :         link is recorded as "clean" or "broken"
+ :)
+declare  %ann:sequential function local:process-external-link($x as xs:string){
+  (: Skip links that already have a recorded outcome. :)
+  if(not(empty(map:get($local:processed-external-links, $x))))
+         then   exit returning false();
+         else {}
+ variable $http-call:=();
+  (: A failing request leaves $http-call empty, which local:alive reports as not alive. :)
+  try{
+        $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+  }
+  catch * {}
+  if( local:alive($http-call))
+          then map:insert($local:processed-external-links, "clean", $x);
+          else map:insert($local:processed-external-links, "broken", $x);
+};
+
+(:~
+ : Builds the options element passed to html:parse.
+ :
+ : @return an options element holding the tidyParam settings used for parsing
+ :)
+declare function local:tidy-options()
+{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+                                         <tidyParam name="output-xml" value="yes" />
+                                         <tidyParam name="doctype" value="omit" />
+                                         <tidyParam name="quote-nbsp" value="no" />
+                                         <tidyParam name="char-encoding" value="utf8" />
+                                         <tidyParam name="newline" value="LF" />
+                                         <tidyParam name="tidy-mark" value="no" />
+                                         <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+                                       </options>
+};
+
+
+(:~
+ : Fetches an internal page, records its outcome in the map named by
+ : $local:processed-internal-links and follows the links it contains.
+ :
+ : Recorded outcomes: "broken" when the request fails or is not answered
+ : with status 200; "clean" when the page is alive (whether or not its media
+ : type is scanned for links); a "cannot tidy..." or "cannot parse..."
+ : message when the content could not be parsed.
+ :
+ : @param $x link to fetch
+ : @param $n depth of $x in the crawl; the start page is at depth 1
+ : @return false() when $x already has a recorded outcome, otherwise the
+ :         results of processing the links found in the page
+ :)
+declare  %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
+      (: Depth limit: pages reached at depth 3 are not fetched. :)
+      if($n=3) then exit returning (); else {}
+      (: Skip links that already have a recorded outcome. :)
+      if(not(empty(map:get($local:processed-internal-links, $x))))
+            then exit returning false();
+              else {}
+       variable $http-call:=();
+       (: A failing request leaves $http-call empty, which local:alive reports as not alive. :)
+       try{
+             $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+       }
+       catch * {}
+      if( not(local:alive($http-call)))
+                then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
+               else {}
+       (: Alive, but not a media type that is scanned for links. :)
+       if(not (local:get-media-type($http-call[1]) = $supported-media-types))
+                then {map:insert($local:processed-internal-links, "clean", $x);  exit returning ();}
+                else {}
+       variable $string-content := xs:string($http-call[2]);
+       variable $content:=();
+       (: Outcome to record for this page; stays "clean" when tidy succeeds. :)
+       variable $status := "clean";
+
+       try{
+             $content:=html:parse($string-content,local:tidy-options() );
+        }
+        catch *
+             { 
+                 $status := concat("cannot tidy", $err:description);
+                 (: Fall back to the XML parser when tidy rejects the page. :)
+                 try{
+                       $content:=parse-xml:parse-xml-fragment ($string-content, "");
+                 }
+                 catch *
+                     { $status := concat("cannot parse", $err:description);}
+            }
+       (: Record the page before recursing, so that successfully parsed pages
+          appear in the results and count as visited for the links below. :)
+       map:insert($local:processed-internal-links, $status, $x);
+       variable $links :=();
+       (: No parsed content: extract links from the raw text instead. :)
+       if(empty($content))
+            then $links:=local:get-out-links-unparsed($string-content, $x);
+           else $links:=local:get-out-links-parsed($content, $x);
+       for $l in $links
+       return  local:process-link($l, $n+1);
+};
+
+
+
+(:~
+ : Dispatches a link to the internal or external handler, depending on
+ : local:is-internal.
+ :
+ : @param $x link to process
+ : @param $n crawl depth, forwarded to local:process-internal-link only
+ : @return whatever the selected handler returns
+ :)
+declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer) as item()*{
+ if(local:is-internal($x))
+       then local:process-internal-link($x, $n);
+       else local:process-external-link($x);
+
+};
+
+
+(:~
+ : Reports the recorded outcome of every processed link.
+ :
+ : @return one INTERNAL element per key of the internal-links map followed by
+ :         one EXTERNAL element per key of the external-links map; each holds
+ :         the link in LINK and its recorded value in RESULT
+ :)
+declare function local:print-results() as element()*
+{
+    (
+      for $link in map:keys($local:processed-internal-links)/map:attribute/@value/string()
+      let $outcome := map:get($local:processed-internal-links, $link)
+      return <INTERNAL><LINK>{$link}</LINK><RESULT>{$outcome}</RESULT></INTERNAL>
+    ),
+    (
+      for $link in map:keys($local:processed-external-links)/map:attribute/@value/string()
+      let $outcome := map:get($local:processed-external-links, $link)
+      return <EXTERNAL><LINK>{$link}</LINK><RESULT>{$outcome}</RESULT></EXTERNAL>
+    )
+};
+
+(:==========================================
+===========================================:)
+
+(: Main script: crawl from $top-uri and return the report, or an error message. :)
+variable $uri:= $top-uri;
+
+variable $result;
+
+(: Any error raised while crawling replaces the report with a message string. :)
+try {
+  local:create-containers();
+  local:process-link($uri, 1);
+  $result:=local:print-results() ;
+}
+catch * { $result:=concat("an error occurred", $err:description);}
+
+
+(: The maps are deleted whether or not the crawl succeeded. :)
+local:delete-containers();
+$result
+
+\endcode
+*/
\ No newline at end of file

=== modified file 'doc/zorba/web_crawler.dox'
--- doc/zorba/web_crawler.dox	2011-09-27 13:23:00 +0000
+++ doc/zorba/web_crawler.dox	2011-10-05 12:23:32 +0000
@@ -1,8 +1,13 @@
 /**
 \page web_crawler_tutorial  Web Crawler example in XQuery
 
-Description of a web crawler example in XQuery.
+Description of a web crawler example in XQuery.<br/>
+The entire script can be seen here: 
+\link link_crawler2
+web crawler script
+\endlink
 
+<br/>
 The idea is to crawl through the pages of a website and store a list with external pages and internal pages and check if they work or not.
 This example uses Zorba's http module for accessing the webpages, and the html module for converting the html to xml.
 The complete code can be found in the test directory of the html convertor module.


Follow ups