zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #02808
[Merge] lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba
Daniel Turcanu has proposed merging lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.
Requested reviews:
Zorba Coders (zorba-coders)
For more details, see:
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/85669
Updated the web crawler tutorial
--
https://code.launchpad.net/~danielturcanu/zorba/web_crawler_tutorial/+merge/85669
Your team Zorba Coders is requested to review the proposed merge of lp:~danielturcanu/zorba/web_crawler_tutorial into lp:zorba.
=== added file 'doc/zorba/link_crawler2.dox'
--- doc/zorba/link_crawler2.dox 1970-01-01 00:00:00 +0000
+++ doc/zorba/link_crawler2.dox 2011-12-14 14:31:06 +0000
@@ -0,0 +1,238 @@
+/**
+\page link_crawler2 Web Crawler example in XQuery
+\code
+(:
+ : Copyright 2006-2011 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+:)
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import module namespace file = "http://expath.org/ns/file";
+
+declare namespace ann = "http://www.zorba-xquery.com/annotations";
+declare namespace xhtml="http://www.w3.org/1999/xhtml";
+declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
+declare namespace err="http://www.w3.org/2005/xqt-errors";
+declare namespace httpsch = "http://expath.org/ns/http-client";
+
+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com";
+
+
+
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
+
+
+(:~
+ : Creates the two maps in which the crawler records its results: one named
+ : by $local:processed-internal-links and one named by
+ : $local:processed-external-links. Both are created with xs:string as the
+ : type argument; the keys inserted elsewhere in this script are URI strings.
+ :)
+declare %ann:sequential function local:create-containers()
+{
+ map:create($local:processed-internal-links, xs:QName("xs:string"));
+ map:create($local:processed-external-links, xs:QName("xs:string"));
+};
+
+(:~
+ : Deletes every map reported by map:available-maps().
+ : Note that this is not restricted to the two maps made by
+ : local:create-containers(): any other map that is available is deleted too.
+ :)
+declare %ann:sequential function local:delete-containers(){
+ for $x in map:available-maps()
+ return map:delete($x);
+};
+
+(:~
+ : Tells whether a URI belongs to the crawled site.
+ : A URI counts as internal when it begins with the value of $uri-host;
+ : this is a plain string-prefix test, no URI parsing is involved.
+ :
+ : @param $x the absolute URI to classify
+ : @return true if $x starts with $uri-host, false otherwise
+ :)
+declare function local:is-internal($x as xs:string) as xs:boolean
+{
+  fn:starts-with($x, $uri-host)
+};
+
+(:~
+ : Returns the part of $s1 that precedes the first occurrence of $s2, or $s1
+ : unchanged when $s2 does not occur in it.
+ :
+ : The presence of the separator is tested with fn:contains rather than by
+ : comparing the result of fn:substring-before with "": an empty result is
+ : also what fn:substring-before yields when $s1 begins with $s2, and in that
+ : case the prefix really is "" and must not be replaced by the whole of $s1.
+ : An empty $s2 is treated as "no separator" and returns $s1.
+ :
+ : @param $s1 the string to cut
+ : @param $s2 the separator to cut at
+ : @return the prefix of $s1 before $s2, or $s1 if $s2 is empty or absent
+ :)
+declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string
+{
+  if ($s2 ne "" and fn:contains($s1, $s2))
+  then fn:substring-before($s1, $s2)
+  else $s1
+};
+
+(:~
+ : Turns a link found on a page into an absolute URI without its fragment.
+ :
+ : The link text is whitespace-normalized, resolved against $start-uri and cut
+ : at the first "#". If any of that raises an error, the raw $href is recorded
+ : in the external-links map as malformed/broken (together with the page it
+ : was found on) and the empty sequence is returned.
+ :
+ : @param $href the link exactly as it appears in the page
+ : @param $start-uri the URI of the page containing the link; used as base URI
+ : @return the absolute URI without fragment, or the empty sequence if the
+ :         link could not be resolved
+ :)
+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
+{
+  (: Initialised explicitly: when the catch branch runs, no assignment has
+     happened, and the final expression must still have a defined value. :)
+  variable $absuri := ();
+  try {
+    $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+  }
+  catch *
+  {
+    map:insert($local:processed-external-links,
+               (<FROM>{$start-uri}</FROM>,
+                <MESSAGE>malformed</MESSAGE>,
+                <RESULT>broken</RESULT>),
+               $href);
+  }
+  $absuri
+};
+
+
+(:~
+ : Extracts the media type from the first Content-Type header of an HTTP
+ : response element, dropping any parameters after ";"
+ : (e.g. "text/html; charset=utf-8" gives "text/html").
+ :
+ : fn:string() is applied to the selected @value so that a response without a
+ : Content-Type header produces "" instead of passing an empty sequence to
+ : the xs:string parameter of local:my-substring-before, which would raise a
+ : type error.
+ :
+ : @param $http-call the response element holding httpsch:header children
+ : @return the media type, or "" when no Content-Type header is present
+ :)
+declare function local:get-media-type ($http-call as node()) as xs:string
+{
+  local:my-substring-before(
+    fn:string(($http-call/httpsch:header[@name = 'Content-Type'])[1]/@value),
+    ";")
+};
+
+(:~
+ : Decides whether an HTTP call succeeded.
+ : The call is considered alive when it produced at least one item and the
+ : status attribute of the first item equals 200. A negative answer is sent
+ : through fn:trace with the label "alive".
+ :
+ : @param $http-call the sequence returned by the HTTP request (may be empty)
+ : @return true for a 200 response, false otherwise
+ :)
+declare function local:alive($http-call as item()*) as xs:boolean
+{
+  let $is-ok := fn:exists($http-call) and ($http-call[1]/@status eq 200)
+  return
+    if ($is-ok)
+    then fn:true()
+    else fn:trace(fn:false(), "alive")
+};
+
+
+(:~
+ : Collects the outgoing links of a parsed page.
+ : Gathers the href attribute of a, link and area elements and the src
+ : attribute of script and img elements (in any namespace), passes each one
+ : through local:get-real-link with $uri as base, and removes duplicates.
+ :
+ : @param $content the parsed page
+ : @param $uri the URI of the page, used to resolve relative links
+ : @return the distinct values returned by local:get-real-link
+ :)
+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+{ distinct-values( for $y in ($content//*:a/string(@href),
+ $content//*:link/string(@href),
+ $content//*:script/string(@src),
+ $content//*:img/string(@src),
+ $content//*:area/string(@href)
+ )
+return local:get-real-link($y, $uri))
+};
+
+
+(:~
+ : Collects the outgoing links of a page that could not be parsed, by running
+ : a regular expression over its raw text.
+ :
+ : The pattern looks for href attributes of a/link/area tags and src
+ : attributes of script/img tags; capture group 7 is the opening quote and
+ : group 8 is the quoted attribute value. Each value is passed through
+ : local:get-real-link with $uri as base, and duplicates are removed.
+ :
+ : The result of fn:analyze-string is a tree of elements in the fn namespace,
+ : so the group elements must be selected as fn:group; an unprefixed "group"
+ : step matches only no-namespace elements and would select nothing.
+ :
+ : NOTE(review): as written, the three alternatives of the first group all
+ : denote the character "<" (the middle one is the entity reference for it);
+ : presumably one of them was meant to match escaped markup - confirm.
+ :
+ : @param $content the raw text of the page
+ : @param $uri the URI of the page, used to resolve relative links
+ : @return the distinct values returned by local:get-real-link
+ :)
+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+  distinct-values(
+    let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
+    for $other-uri2 in $search//fn:group[@nr = 8]/string()
+    return local:get-real-link($other-uri2, $uri)
+  )
+};
+
+
+(:~
+ : Records the outcome of an HTTP call for $url in the map $map-name.
+ : When the call returned at least one item, the inserted value is a STATUS
+ : and a MESSAGE element taken from the status and message attributes of the
+ : first item, plus a RESULT element that is "Ok" if local:alive accepts the
+ : call and "broken" otherwise. When the call returned nothing, only
+ : RESULT "broken" is inserted.
+ :
+ : NOTE(review): the process-*-link functions insert a FROM element under the
+ : same key before calling this function; whether map:insert replaces or
+ : extends an existing entry is not visible here - confirm FROM survives.
+ :
+ : @param $map-name the QName of the map to write to
+ : @param $url the key under which the result is stored
+ : @param $http-result the sequence returned by the HTTP request (may be empty)
+ :)
+declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*)
+{
+ if(count($http-result) ge 1)
+ then
+ map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
+ <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
+ <RESULT>{if(local:alive($http-result)) then "Ok" else "broken"}</RESULT>), $url)
+ else map:insert($map-name, <RESULT>broken</RESULT>, $url)
+};
+
+(:~
+ : Dispatches a link to the internal or the external handler, depending on
+ : the answer of local:is-internal.
+ :
+ : @param $x the absolute URI to process
+ : @param $baseUri the URI of the page on which $x was found
+ : @param $n a counter forwarded to local:process-internal-link; it is not
+ :           used for external links
+ :)
+declare %ann:sequential function local:process-link($x as xs:string, $baseUri as xs:string, $n as xs:integer) as item()*{
+ if(local:is-internal($x))
+ then local:process-internal-link($x, $baseUri, $n);
+ else local:process-external-link($x, $baseUri);
+
+};
+
+(:~
+ : Checks that an external link can be reached and records the outcome in
+ : the external-links map. The page content is never inspected.
+ :
+ : @param $x the absolute URI to check
+ : @param $baseUri the URI of the page on which $x was found
+ :)
+declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
+ (: A link that already has an entry in the map is not requested again. :)
+ if(not(empty(map:get($local:processed-external-links, $x))))
+ then exit returning false();
+ else {}
+ fn:trace($x, "HEAD external link");
+ (: Insert an entry before the request is sent. :)
+ map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
+ variable $http-call:=();
+ try{
+ $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+ (: If HEAD answered with anything other than status 200, retry with GET. :)
+ if((count($http-call) ge 1) and
+ fn:not($http-call[1]/@status eq 200)) then
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ else
+ ();
+ }
+ (: Errors are ignored here; $http-call keeps whatever was last assigned
+    and local:map-insert-result decides what to record for it. :)
+ catch * { }
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
+};
+
+(:~
+ : Builds the options element that local:process-internal-link passes to
+ : html:parse. The tidyParam entries request XML output, omit the doctype,
+ : set utf8 encoding and LF newlines, switch off the tidy mark and declare a
+ : list of additional inline tag names.
+ :
+ : @return the options element in the html-options namespace
+ :)
+declare function local:tidy-options()
+{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyParam name="output-xml" value="yes" />
+ <tidyParam name="doctype" value="omit" />
+ <tidyParam name="quote-nbsp" value="no" />
+ <tidyParam name="char-encoding" value="utf8" />
+ <tidyParam name="newline" value="LF" />
+ <tidyParam name="tidy-mark" value="no" />
+ <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+ </options>
+};
+
+
+(:~
+ : Fetches an internal page, records the outcome in the internal-links map
+ : and processes every link found on the page via local:process-link.
+ :
+ : @param $x the absolute URI of the page
+ : @param $baseUri the URI of the page on which $x was found
+ : @param $n a counter, passed on as $n+1 for every outgoing link; the only
+ :           check against it is commented out, so nothing limits the depth
+ :)
+declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
+ (: if($n=3) then exit returning (); else {} :)
+ (: A page that already has an entry in the map is not fetched again. :)
+ if(not(empty(map:get($local:processed-internal-links, $x))))
+ then exit returning false();
+ else {}
+ fn:trace($x, "GET internal link");
+ (: Insert an entry before the request is sent. :)
+ map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
+ variable $http-call:=();
+ try{
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ }
+ (: Errors are ignored; $http-call then stays empty and fails the alive check. :)
+ catch * { }
+ if( not(local:alive($http-call)))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+ else {}
+
+ (: Only text/html responses are searched for links; anything else is just recorded. :)
+ if(not (local:get-media-type($http-call[1]) = "text/html"))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+ else {}
+ variable $string-content := string($http-call[2]);
+ variable $content:=();
+
+ (: First attempt: html:parse with the tidy options. On error the page is
+    recorded as broken and parse-xml-fragment is tried instead. :)
+ try{
+ $content:=html:parse($string-content,local:tidy-options() );
+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
+ }
+ catch *
+ {
+ map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
+ <RESULT>broken</RESULT>), $x);
+ try{
+ $content:=parse-xml:parse-xml-fragment ($string-content, "");
+ }
+ catch *
+ { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
+ }
+ variable $links :=();
+ (: With no parsed content available, extract the links from the raw text. :)
+ if(empty($content))
+ then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
+ else $links:=local:get-out-links-parsed($content, $x);
+ for $l in $links
+ return local:process-link($l, $x, $n+1);
+};
+
+
+
+
+(:~
+ : Builds the report from the contents of the two result maps.
+ : Every key of the internal-links map becomes an INTERNAL element and every
+ : key of the external-links map an EXTERNAL element; each holds a LINK child
+ : with the key followed by the value stored for that key.
+ :
+ : @return the INTERNAL elements followed by the EXTERNAL elements
+ :)
+declare function local:print-results() as element()*
+{
+  (
+    for $internal-uri in map:keys($local:processed-internal-links)/map:attribute/@value/string()
+    return
+      <INTERNAL><LINK>{$internal-uri}</LINK>{map:get($local:processed-internal-links,$internal-uri)}</INTERNAL>
+  ),
+  (
+    for $external-uri in map:keys($local:processed-external-links)/map:attribute/@value/string()
+    return
+      <EXTERNAL><LINK>{$external-uri}</LINK>{map:get($local:processed-external-links,$external-uri)}</EXTERNAL>
+  )
+};
+
+(:==========================================
+===========================================:)
+
+(: Main program: create the maps, crawl starting at $top-uri, collect the
+   report, delete the maps and write the report to link_crawler_result.xml. :)
+variable $uri:= $top-uri;
+
+variable $result;
+
+local:create-containers();
+(: The start page has no referring page, hence the empty $baseUri. :)
+local:process-link($uri, "", 1);
+(: The report is built before the maps are deleted, since it reads from them. :)
+$result:=local:print-results() ;
+
+local:delete-containers();
+
+file:write(fn:resolve-uri("link_crawler_result.xml"),
+ <result>{$result}</result>,
+ <output:serialization-parameters>
+ <output:indent value="yes"/>
+ </output:serialization-parameters>)
+
+
+\endcode
+*/
\ No newline at end of file
=== modified file 'doc/zorba/web_crawler.dox'
--- doc/zorba/web_crawler.dox 2011-10-07 08:28:43 +0000
+++ doc/zorba/web_crawler.dox 2011-12-14 14:31:06 +0000
@@ -1,17 +1,23 @@
/**
\page web_crawler_tutorial Web Crawler example in XQuery
-Description of a web crawler example in XQuery.
+Description of a web crawler example in XQuery.<br/>
+The entire script can be seen here:
+\link link_crawler2
+web crawler script
+\endlink
+<br/>
The idea is to crawl through the pages of a website and store a list with external pages and internal pages and check if they work or not.
This example uses Zorba's http module for accessing the webpages, and the html module for converting the html to xml.
-The complete code can be found in the test directory of the html convertor module.
+The complete code can be found in the test directory of the html converter module (link_crawler2.xq2).
\code
import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import module namespace file = "http://expath.org/ns/file";
\endcode
The internal pages are checked recursively, while the external ones are only checked for existence.
@@ -19,8 +25,8 @@
Change this variable to point to your website, or a subdirectory on your website.
\code
-declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
-declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/";
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com";
declare function local:is-internal($x as xs:string) as xs:boolean
{
@@ -32,7 +38,7 @@
The crawling starts from the URI pointed to by $top-uri.
Visited links are stored as nodes in two maps, one for internal pages and one for external pages.
-The keys are the URIs, and the values are the strings "broken" or "clean".
+The keys are the URIs, and the values are the strings "broken" or "clean", plus error codes if processing failed.
The maps are used to avoid parsing the same page twice.
\code
@@ -55,10 +61,36 @@
After parsing an internal page with html module, all the links are extracted and parsed recursively, if they haven't been parsed.
The html module uses the tidy library, so we pass tidy options to set up the conversion from html to xml.
Some html tags are marked to be ignored in new-inline-tags param, this being a particular case of this website.
-You can add or remove tags to suit your website needs.
+You can add or remove tags to suit your website needs.<br/>
+The spaces in the url links are trimmed and normalized, and the fragment part is ignored.
\code
-declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+declare variable $local:tidy-options := <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyParam name="output-xml" value="yes" />
+ <tidyParam name="doctype" value="omit" />
+ <tidyParam name="quote-nbsp" value="no" />
+ <tidyParam name="char-encoding" value="utf8" />
+ <tidyParam name="newline" value="LF" />
+ <tidyParam name="tidy-mark" value="no" />
+ <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+ </options>;
+
+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
+{
+ variable $absuri;
+ try{
+ $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+ }
+ catch *
+ {
+ map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>,
+ <MESSAGE>malformed</MESSAGE>,
+ <RESULT>broken</RESULT>), $href);
+ }
+ $absuri
+};
+
+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
{ distinct-values( for $y in ($content//*:a/string(@href),
$content//*:link/string(@href),
$content//*:script/string(@src),
@@ -68,90 +100,128 @@
return local:get-real-link($y, $uri))
};
-declare function local:tidy-options()
-{<options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
- <tidyParam name="output-xml" value="yes" />
- <tidyParam name="doctype" value="omit" />
- <tidyParam name="quote-nbsp" value="no" />
- <tidyParam name="char-encoding" value="utf8" />
- <tidyParam name="newline" value="LF" />
- <tidyParam name="tidy-mark" value="no" />
- <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
- </options>
+
+declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*)
+{
+ if(count($http-result) ge 1)
+ then
+ map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
+ <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
+ <RESULT>{if(local:alive($http-result))
+ then "Ok"
+ else if(local:is-redirect($http-result))
+ then "redirect"
+ else "broken"
+ }</RESULT>), $url);
+ else map:insert($map-name, <RESULT>broken</RESULT>, $url);
+ if(local:is-redirect($http-result)) then
+ map:insert($map-name, <REDIRECT>{fn:string($http-result[1]/httpsch:header[@name = "Location"]/@value)}</REDIRECT>, $url);
+ else {}
};
-declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
- if($n=3) then exit returning (); else {}
+declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
if(not(empty(map:get($local:processed-internal-links, $x))))
then exit returning false();
else {}
+ fn:trace($x, "GET internal link");
+ map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
variable $http-call:=();
try{
- $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
- }
- catch * {}
- if( not(local:alive($http-call)))
- then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
- else {}
- if(not (local:get-media-type($http-call[1]) = $supported-media-types))
- then {map:insert($local:processed-internal-links, "clean", $x); exit returning ();}
- else {}
- variable $string-content := xs:string($http-call[2]);
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}" follow-redirect="false"/>, (), ());
+ }
+ catch * { }
+ if(local:is-redirect($http-call)) then
+ {
+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
+ try{
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ }
+ catch * { }
+ }
+ else {}
+ if( not(local:alive($http-call)))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+ else {}
+
+ if(not (local:get-media-type($http-call[1]) = "text/html"))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+ else {}
+ variable $string-content := string($http-call[2]);
variable $content:=();
try{
- $content:=html:parse($string-content,local:tidy-options() );
+ $content:=html:parse($string-content,$local:tidy-options );
+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
}
catch *
- {
- map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x);
+ {
+ map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
+ <RESULT>broken</RESULT>), $x);
try{
$content:=parse-xml:parse-xml-fragment ($string-content, "");
}
catch *
- { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);}
+ { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
}
variable $links :=();
if(empty($content))
- then $links:=local:get-out-links-unparsed($string-content, $x);
+ then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
else $links:=local:get-out-links-parsed($content, $x);
for $l in $links
- return local:process-link($l, $n+1);
+ return local:process-link($l, $x, $n+1);
};
\endcode
+For each parsed link, we store the FROM, STATUS, MESSAGE and RESULT. The RESULT is "Ok" if everything went fine,
+or "broken" if the page couldn't be retrieved or parsed, and in this case MESSAGE contains the error message.
+The FROM element contains the parent url for that link.<br/>
+<br/>
Some html pages have errors, and the tidy library is very strict when checking for errors.
When the parsing fails, we fall back to using a regex for extracting the links.
\code
-declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
distinct-values(
let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
for $other-uri2 in $search//group[@nr=8]/string()
- let $y:= fn:normalize-space($other-uri2)
- return local:get-real-link($y, $uri)
+ return local:get-real-link($other-uri2, $uri)
)
};
\endcode
For external links, we just check if they exist, so the http command requests only for HEAD.
+Some websites return an error for HEAD; in this case we fall back to GET.
\code
-declare %ann:sequential function local:process-external-link($x as xs:string){
+declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
if(not(empty(map:get($local:processed-external-links, $x))))
then exit returning false();
else {}
- variable $http-call:=();
+ fn:trace($x, "HEAD external link");
+ map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
+ variable $http-call:=();
try{
- $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ if((count($http-call) ge 1) and
+ fn:not($http-call[1]/@status eq 200)) then
+ {
+ if(local:is-redirect($http-call)) then
+ {
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
+ }
+ else {}
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
+ }
+ else
+ {}
}
- catch * {}
- if( local:alive($http-call))
- then map:insert($local:processed-external-links, "clean", $x);
- else map:insert($local:processed-external-links, "broken", $x);
+ catch *
+ { $http-call:=();}
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
};
\endcode
@@ -170,4 +240,29 @@
\endcode
-*/
\ No newline at end of file
+The main program calls the recursive function local:process-link for the $top-uri.
+
+\code
+(:==========================================
+===========================================:)
+
+variable $uri:= $top-uri;
+
+variable $result;
+
+local:create-containers();
+local:process-link($uri, "", 1);
+$result:=local:print-results() ;
+
+local:delete-containers();
+
+file:write(fn:resolve-uri("link_crawler_result.xml"),
+ <result>{$result}</result>,
+ <output:serialization-parameters>
+ <output:indent value="yes"/>
+ </output:serialization-parameters>)
+
+\endcode
+
+
+*/
=== removed file 'scripts/link_crawler.xq'
--- scripts/link_crawler.xq 2011-08-18 20:07:20 +0000
+++ scripts/link_crawler.xq 1970-01-01 00:00:00 +0000
@@ -1,232 +0,0 @@
-(:
- : Copyright 2006-2011 The FLWOR Foundation.
- :
- : Licensed under the Apache License, Version 2.0 (the "License");
- : you may not use this file except in compliance with the License.
- : You may obtain a copy of the License at
- :
- : http://www.apache.org/licenses/LICENSE-2.0
- :
- : Unless required by applicable law or agreed to in writing, software
- : distributed under the License is distributed on an "AS IS" BASIS,
- : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- : See the License for the specific language governing permissions and
- : limitations under the License.
-:)
-
-import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
-import module namespace file = "http://expath.org/ns/file";
-import module namespace ddl = "http://www.zorba-xquery.com/modules/store/dynamic/collections/ddl";
-import module namespace dml = "http://www.zorba-xquery.com/modules/store/dynamic/collections/dml";
-import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
-(:import module namespace lcc = "http://www.zorba-xquery.com/scripts/link-crawler-collections" at "link_crawler_collections.xq";:)
-import module namespace tidy="http://www.zorba-xquery.com/modules/converters/html";
-import schema namespace tidy-options="http://www.zorba-xquery.com/modules/converters/html-options";
-import schema namespace httpsch = "http://expath.org/ns/http-client";
-declare namespace ann = "http://www.zorba-xquery.com/annotations";
-declare namespace xhtml="http://www.w3.org/1999/xhtml";
-declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
-declare namespace err="http://www.w3.org/2005/xqt-errors";
-
-declare variable $result-file as xs:string external; (:PROJECT_SOURCE_DIR:)
-
-declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html"; (: the start page :)
-declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/"; (: what differentiates an internal uri :)
-
-declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity",
- "application/atom+xml", "text/html");
-
-declare variable $internal-uris as xs:QName := xs:QName("internal-uris");
-declare variable $external-uris as xs:QName := xs:QName("external-uris");
-
-declare variable $broken-internal as xs:QName := xs:QName("local:broken-internal");
-declare variable $broken-external as xs:QName := xs:QName("local:broken-external");
-declare variable $pages-cannot-parse as xs:QName := xs:QName("local:pages-cannot-parse");
-
-
-declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string
-{
- let $sb := fn:substring-before($s1, $s2)
- return
- if($sb = "") then
- $s1
- else
- $sb
-};
-
-
-declare %ann:sequential function local:get-uris-from-page($uri as xs:string,
- $reluri as xs:string,
- $call-from as xs:string)
-{
- variable $method;
- if(fn:starts-with($uri, $uri-host)) then
- {
- map:insert($internal-uris, $uri, $uri);
- $method := "GET";
- }
- else
- {
- map:insert($external-uris, $uri, $uri);
- $method := "HEAD";
- }
- fn:trace($uri, "");
-
- variable $load-result;
- variable $content-string;
- try{
- $load-result := http:send-request(<httpsch:request method="{$method}" href="{$uri}"/>, (), ());
- if($load-result[1]/@status eq 200) then
- if(fn:starts-with($uri, $uri-host)) then
- {
- $content-string := string($load-result[2]);
- let $media-type := local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")
- return
- if($media-type = "text/html") then
- let $content := tidy:parse($content-string,
- <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
- <tidyParam name="output-xml" value="yes" />
- <tidyParam name="doctype" value="omit" />
- <tidyParam name="quote-nbsp" value="no" />
- <tidyParam name="char-encoding" value="utf8" />
- <tidyParam name="newline" value="LF" />
- <tidyParam name="tidy-mark" value="no" />
- <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
- </options>)
- for $other-uri2 in ($content//*:a/string(@href),
- $content//*:link/string(@href),
- $content//*:script/string(@src),
- $content//*:img/string(@src),
- $content//*:area/string(@href)
- )
- let $other-uri := fn:normalize-space($other-uri2)
- let $absuri := local:my-substring-before(fn:resolve-uri($other-uri, $uri), "#")
- return
- if(fn:not(fn:starts-with($other-uri, "#")) and
- fn:empty(map:get($internal-uris, $absuri)) and
- fn:empty(map:get($external-uris, $absuri))) then
- local:get-uris-from-page($absuri, $other-uri, $uri);
- else (:already followed this link:)
- ();
- else(: it's binary :)
- fn:trace((" has binary content ", $media-type), "");
- }
- else(:success loading external link:)
- ();
- else (: broken link :)
- if(fn:starts-with($uri, $uri-host)) then
- {
- dml:insert-nodes-last($broken-internal, <internal-broken-uri>
- <uri>{$reluri}</uri>
- <call-from>{$call-from}</call-from>
- <media-type>{local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")}</media-type>
- </internal-broken-uri>);
- }
- else
- {
- dml:insert-nodes-last($broken-external, <external-broken-uri>
- <uri>{$uri}</uri>
- <call-from>{$call-from}</call-from>
- <media-type>{local:my-substring-before($load-result[1]/httpsch:header[@name = 'Content-Type'][1]/fn:string(@value), ";")}</media-type>
- </external-broken-uri>);
- }
- }catch ZXQP0003
- {
- dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse>
- <uri>{$uri}</uri>
- <reluri>{$reluri}</reluri>
- <call-from>{$call-from}</call-from>
- <err-code>{$err:code}</err-code>
- <err-description>{$err:description}</err-description>
- <err-value>{$err:value}</err-value>
- <err-module>{$err:module}</err-module>
- <err-line>{$err:line-number}</err-line>
- </page-cannot-parse>);
- try{ (: tidy failed to parse the html, use regex:)
- let $content := $content-string
- let $search := fn:analyze-string($content, "(<|&lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
- for $other-uri2 in $search//fn:group[@nr=8]/fn:string()
- let $other-uri := fn:normalize-space($other-uri2)
- let $absuri := local:my-substring-before(fn:resolve-uri($other-uri, $uri), "#")
- return
- if(fn:not(fn:starts-with($other-uri, "#")) and
- fn:empty(map:get($internal-uris, $absuri)) and
- fn:empty(map:get($external-uris, $absuri))) then
- local:get-uris-from-page($absuri, $other-uri, $uri);
- else (:already followed this link:)
- ();
- }catch *
- {
- dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse>
- <uri>{$uri}</uri>
- <reluri>{$reluri}</reluri>
- <call-from>{$call-from}</call-from>
- <err-code>{$err:code}</err-code>
- <err-description>{$err:description}</err-description>
- <err-value>{$err:value}</err-value>
- <err-module>{$err:module}</err-module>
- <err-line>{$err:line-number}</err-line>
- </page-cannot-parse>);
- }
- }catch *
- {
- dml:insert-nodes-last($pages-cannot-parse, <page-cannot-parse>
- <uri>{$uri}</uri>
- <reluri>{$reluri}</reluri>
- <call-from>{$call-from}</call-from>
- <err-code>{$err:code}</err-code>
- <err-description>{$err:description}</err-description>
- <err-value>{$err:value}</err-value>
- <err-module>{$err:module}</err-module>
- <err-line>{$err:line-number}</err-line>
- </page-cannot-parse>);
- }
-};
-
-
-
-map:create($internal-uris, xs:QName("xs:string"));
-ddl:create($broken-internal);
-map:create($external-uris, xs:QName("xs:string"));
-ddl:create($broken-external);
-ddl:create($pages-cannot-parse);
-
-local:get-uris-from-page($top-uri, $top-uri, "");
-
-(:display results:)
-let $full-report :=
-<link-crawler website="{$top-uri}">
-<internal-broken-uris>
-{for $i in dml:collection($broken-internal)
-let $u := $i/uri
-group by $u
-return $i}
-</internal-broken-uris>
-<pages-cannot-parse>
-{for $e in dml:collection($pages-cannot-parse)
-let $u := $e/uri
-group by $u
-return $e}
-</pages-cannot-parse>
-<external-uris>
-{for $e in map:keys($external-uris)
-let $v := $e/attribute/@value
-group by $v
-return <external-uri>{$e}</external-uri>}
-</external-uris>
-<external-broken-uris>
-{for $e in dml:collection($broken-external)
-let $u := $e/uri
-group by $u
-return $e}
-</external-broken-uris>
-
-</link-crawler>
-
-return
-file:write($result-file,
- $full-report,
- <output:serialization-parameters>
- <output:indent value="yes"/>
- </output:serialization-parameters>)
-
Follow ups