zorba-coders team mailing list archive - Message #00253
[Merge] lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba
Daniel Turcanu has proposed merging lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.
Requested reviews:
Zorba Coders (zorba-coders)
For more details, see:
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/77179
Added a tutorial for the web crawler script from the html module (or the script directory in Zorba).
--
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/77179
Your team Zorba Coders is requested to review the proposed merge of lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.
=== modified file 'doc/zorba/indexpage.dox.in'
--- doc/zorba/indexpage.dox.in 2011-09-06 16:39:46 +0000
+++ doc/zorba/indexpage.dox.in 2011-09-27 15:05:56 +0000
@@ -127,6 +127,14 @@
<!--li>\ref extensions_update</li-->
+</td></tr>
+<tr><td class="tdDocIndexTable">
+
+
+ <h2>Tutorials</h2>
+
+ \ref web_crawler_tutorial
+
</td><tr>
</table>
=== added file 'doc/zorba/web_crawler.dox'
--- doc/zorba/web_crawler.dox 1970-01-01 00:00:00 +0000
+++ doc/zorba/web_crawler.dox 2011-09-27 15:05:56 +0000
@@ -0,0 +1,173 @@
+/**
+\page web_crawler_tutorial Web Crawler example in XQuery
+
+Description of a web crawler example in XQuery.
+
+The idea is to crawl through the pages of a website, build a list of internal and external pages, and check whether each of them works.
+This example uses Zorba's http module for accessing the web pages, and the html module for converting the HTML to XML.
+The complete code can be found in the test directory of the html converter module.
+
+\code
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+
+(: the snippets below also rely on the http-client request schema
+   and on Zorba's annotations namespace :)
+import schema namespace httpsch = "http://expath.org/ns/http-client";
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+\endcode
+
+The internal pages are checked recursively, while the external ones are only checked for existence.
+The distinction between internal and external links is made by comparing each URI against the global string variable $uri-host.
+Change this variable to point to your website, or to a subdirectory of your website.
+
+\code
+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
+
+declare function local:is-internal($x as xs:string) as xs:boolean
+{
+ starts-with($x, $uri-host)
+};
+
+\endcode
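+
+For example, with the values above:
+
+\code
+local:is-internal("http://www.zorba-xquery.com/site2/html/index.html")  (: true  :)
+local:is-internal("http://www.w3.org/TR/xquery/")                       (: false :)
+\endcode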
+
+The crawling starts from the URI pointed to by $top-uri.
+
+Visited links are stored as nodes in two maps, one for internal pages and one for external pages.
+The keys are the URIs, and the values are the strings "broken" or "clean".
+The maps are used to avoid parsing the same page twice.
+
+\code
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
+
+declare %ann:sequential function local:create-containers()
+{
+ map:create($local:processed-internal-links, xs:QName("xs:string"));
+ map:create($local:processed-external-links, xs:QName("xs:string"));
+};
+
+declare %ann:sequential function local:delete-containers(){
+ for $x in map:available-maps()
+ return map:delete($x);
+};
+
+\endcode
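+
+As used throughout the crawler, map:insert takes the value before the key, and map:get looks an entry up by key (illustrative calls only):
+
+\code
+map:insert($local:processed-internal-links, "clean", $top-uri);
+map:get($local:processed-internal-links, $top-uri)   (: returns "clean" :)
+\endcode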
+
+After an internal page has been parsed with the html module, all of its links are extracted and processed recursively, unless they have already been processed.
+The html module uses the tidy library, so tidy options are used to configure the conversion from HTML to XML.
+Some non-standard HTML tags are declared in the new-inline-tags parameter so that tidy accepts them; this is specific to the website being crawled.
+You can add or remove tags to suit your website's needs.
+
+\code
+declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+{
+  distinct-values(
+    for $y in ($content//*:a/string(@href),
+               $content//*:link/string(@href),
+               $content//*:script/string(@src),
+               $content//*:img/string(@src),
+               $content//*:area/string(@href))
+    return local:get-real-link($y, $uri))
+};
+
+declare function local:tidy-options()
+{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyParam name="output-xml" value="yes" />
+ <tidyParam name="doctype" value="omit" />
+ <tidyParam name="quote-nbsp" value="no" />
+ <tidyParam name="char-encoding" value="utf8" />
+ <tidyParam name="newline" value="LF" />
+ <tidyParam name="tidy-mark" value="no" />
+ <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+ </options>
+};
+
+declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
+  (: limit the crawl depth :)
+  if($n=3) then exit returning (); else {}
+  (: skip links that have already been processed :)
+  if(not(empty(map:get($local:processed-internal-links, $x))))
+  then exit returning false();
+  else {}
+  variable $http-call:=();
+  try{
+    $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+  }
+  catch * {}
+  (: a failed request marks the link as broken :)
+  if(not(local:alive($http-call)))
+  then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
+  else {}
+  (: do not recurse into media types we cannot parse :)
+  if(not(local:get-media-type($http-call[1]) = $supported-media-types))
+  then {map:insert($local:processed-internal-links, "clean", $x); exit returning ();}
+  else {}
+  variable $string-content := xs:string($http-call[2]);
+  variable $content:=();
+  (: first try tidy; if that fails, fall back to the plain XML parser :)
+  try{
+    $content:=html:parse($string-content, local:tidy-options());
+  }
+  catch *
+  {
+    map:insert($local:processed-internal-links, concat("cannot tidy ", $err:description), $x);
+    try{
+      $content:=parse-xml:parse-xml-fragment($string-content, "");
+    }
+    catch *
+    { map:insert($local:processed-internal-links, concat("cannot parse ", $err:description), $x);}
+  }
+  (: extract the links and process each of them recursively :)
+  variable $links:=();
+  if(empty($content))
+  then $links:=local:get-out-links-unparsed($string-content, $x);
+  else $links:=local:get-out-links-parsed($content, $x);
+  for $l in $links
+  return local:process-link($l, $n+1);
+};
+
+\endcode
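+
+The snippet above relies on a few helpers that are not listed in this tutorial: $supported-media-types, local:alive, local:get-media-type, local:get-real-link, and local:process-link.
+Their exact definitions are in the complete script; the following is only a minimal sketch of what they might look like, assuming the EXPath http-client response format (an http:response element with a @status attribute and a body carrying a @media-type):
+
+\code
+(: sketch only; the complete script in the test directory is authoritative :)
+declare variable $supported-media-types := ("text/html", "application/xml");
+
+(: a response is alive if we got one and its status is not an error :)
+declare function local:alive($http-call as item()*) as xs:boolean
+{
+  if(empty($http-call)) then false()
+  else xs:integer($http-call[1]/@status) < 400
+};
+
+declare function local:get-media-type($response as node()) as xs:string*
+{
+  $response//@media-type/string()
+};
+
+(: resolve a (possibly relative) href against the page it was found on,
+   dropping any fragment part :)
+declare function local:get-real-link($href as xs:string, $base as xs:string) as xs:string?
+{
+  let $no-fragment := if(contains($href, "#"))
+                      then substring-before($href, "#")
+                      else $href
+  return if($no-fragment = "") then ()
+         else string(resolve-uri($no-fragment, $base))
+};
+
+(: dispatch a link to the internal or the external handler :)
+declare %ann:sequential function local:process-link($l as xs:string, $n as xs:integer)
+{
+  if(local:is-internal($l))
+  then local:process-internal-link($l, $n);
+  else local:process-external-link($l);
+};
+\endcode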
+
+Some HTML pages contain errors, and the tidy library is very strict when checking them.
+When parsing fails, we fall back to a regular expression for extracting the links.
+
+\code
+declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*
+{
+  distinct-values(
+    (: group 7 captures the quote character and group 8 the link itself;
+       \7 matches the same quote that opened the attribute value :)
+    let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
+    for $other-uri2 in $search//group[@nr=8]/string()
+    let $y := fn:normalize-space($other-uri2)
+    return local:get-real-link($y, $uri)
+  )
+};
+
+\endcode
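+
+For instance, given the sketch of local:get-real-link shown earlier, a call such as the following (a hypothetical snippet, not part of the original script) resolves the extracted relative link against the page URI:
+
+\code
+local:get-out-links-unparsed('<a href="contact.html">Contact</a>', $top-uri)
+(: e.g. "http://www.zorba-xquery.com/site2/html/contact.html" :)
+\endcode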
+
+For external links, we only check whether they exist, so the HTTP request uses the HEAD method.
+
+\code
+declare %ann:sequential function local:process-external-link($x as xs:string){
+ if(not(empty(map:get($local:processed-external-links, $x))))
+ then exit returning false();
+ else {}
+ variable $http-call:=();
+ try{
+ $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+ }
+ catch * {}
+ if( local:alive($http-call))
+ then map:insert($local:processed-external-links, "clean", $x);
+ else map:insert($local:processed-external-links, "broken", $x);
+};
+
+\endcode
+
+After the crawl finishes, the results are returned in XML format.
+
+\code
+declare function local:print-results() as element()*
+{
+ for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
+ return <INTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-internal-links,$x)}</RESULT></INTERNAL>,
+
+ for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
+ return <EXTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-external-links,$x)}</RESULT></EXTERNAL>
+};
+
+\endcode
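+
+Putting it all together, the main query body might look like the following sketch (the complete script in the test directory is authoritative):
+
+\code
+local:create-containers();
+local:process-internal-link($top-uri, 0);  (: assumption: the crawl starts at depth 0 :)
+variable $results := local:print-results();
+local:delete-containers();
+$results
+\endcode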
+
+*/
\ No newline at end of file