zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #01080
[Merge] lp:~danielturcanu/zorba/my_conv_module into lp:zorba/data-converters-module
Daniel Turcanu has proposed merging lp:~danielturcanu/zorba/my_conv_module into lp:zorba/data-converters-module.
Requested reviews:
Zorba Coders (zorba-coders)
For more details, see:
https://code.launchpad.net/~danielturcanu/zorba/my_conv_module/+merge/79390
Updated link_crawler to display FROM, STATUS, RESULT for each link
--
https://code.launchpad.net/~danielturcanu/zorba/my_conv_module/+merge/79390
Your team Zorba Coders is requested to review the proposed merge of lp:~danielturcanu/zorba/my_conv_module into lp:zorba/data-converters-module.
=== modified file 'test_html/Queries/converters/html/link_crawler2.xq2'
--- test_html/Queries/converters/html/link_crawler2.xq2 2011-10-06 07:40:17 +0000
+++ test_html/Queries/converters/html/link_crawler2.xq2 2011-10-14 10:56:25 +0000
@@ -18,6 +18,7 @@
import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import module namespace file = "http://expath.org/ns/file";
declare namespace ann = "http://www.zorba-xquery.com/annotations";
declare namespace xhtml="http://www.w3.org/1999/xhtml";
@@ -26,15 +27,12 @@
declare namespace httpsch = "http://expath.org/ns/http-client";
declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/site2/html/index.html";
-declare variable $uri-host as xs:string := "http://www.zorba-xquery.com/site2/";
-
-
-declare variable $supported-media-types as xs:string+ := ("text/xml", "application/xml", "text/xml-external-parsed-entity", "application/xml-external-parsed-entity",
- "application/atom+xml", "text/html");
-
-
-declare variable $local:processed-internal-links:=xs:QName("processed-internal-links");
-declare variable $local:processed-external-links :=xs:QName("processed-external-links");
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com";
+
+
+
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
declare %ann:sequential function local:create-containers()
@@ -59,9 +57,19 @@
return if($sb = "") then $s1 else $sb
};
-declare function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string
+declare %ann:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string?
{
- local:my-substring-before(resolve-uri($href, $start-uri), "#")
+ variable $absuri;
+ try{
+ $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+ }
+ catch *
+ {
+ map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>,
+ <MESSAGE>malformed</MESSAGE>,
+ <RESULT>broken</RESULT>), $href);
+ }
+ $absuri
};
@@ -70,13 +78,15 @@
local:my-substring-before($http-call/httpsch:header[@name = 'Content-Type'][1]/string(@value), ";")
};
-declare function local:alive($http-call as node()*) as xs:boolean
+declare function local:alive($http-call as item()*) as xs:boolean
{
- if(($http-call[1]/@status eq 200)) then true() else false()
+ if((count($http-call) ge 1) and
+ ($http-call[1]/@status eq 200))
+ then true() else fn:trace(false(), "alive")
};
-declare function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
+declare %ann:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string*
{ distinct-values( for $y in ($content//*:a/string(@href),
$content//*:link/string(@href),
$content//*:script/string(@src),
@@ -87,30 +97,50 @@
};
-declare function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
+declare %ann:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string*{
distinct-values(
let $search := fn:analyze-string($content, "(&lt;|&amp;lt;|<)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
for $other-uri2 in $search//group[@nr=8]/string()
- let $y:= fn:normalize-space($other-uri2)
- return local:get-real-link($y, $uri)
+ return local:get-real-link($other-uri2, $uri)
)
};
-
-declare %ann:sequential function local:process-external-link($x as xs:string){
+declare %ann:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*)
+{
+ if(count($http-result) ge 1)
+ then
+ map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
+ <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
+ <RESULT>{if(local:alive($http-result)) then "Ok" else "broken"}</RESULT>), $url)
+ else map:insert($map-name, <RESULT>broken</RESULT>, $url)
+};
+
+declare %ann:sequential function local:process-link($x as xs:string, $baseUri as xs:string, $n as xs:integer) as item()*{
+ if(local:is-internal($x))
+ then local:process-internal-link($x, $baseUri, $n);
+ else local:process-external-link($x, $baseUri);
+
+};
+
+declare %ann:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
if(not(empty(map:get($local:processed-external-links, $x))))
then exit returning false();
else {}
- variable $http-call:=();
+ fn:trace($x, "HEAD external link");
+ map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
+ variable $http-call:=();
try{
$http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ());
+ if((count($http-call) ge 1) and
+ fn:not($http-call[1]/@status eq 200)) then
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ else
+ ();
}
- catch * {}
- if( local:alive($http-call))
- then map:insert($local:processed-external-links, "clean", $x);
- else map:insert($local:processed-external-links, "broken", $x);
+ catch * { }
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
};
declare function local:tidy-options()
@@ -126,61 +156,59 @@
};
-declare %ann:sequential function local:process-internal-link($x as xs:string, $n as xs:integer){
- if($n=3) then exit returning (); else {}
+declare %ann:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
+ (: if($n=3) then exit returning (); else {} :)
if(not(empty(map:get($local:processed-internal-links, $x))))
then exit returning false();
else {}
+ fn:trace($x, "GET internal link");
+ map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
variable $http-call:=();
try{
- $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
}
- catch * {}
- if( not(local:alive($http-call)))
- then { map:insert($local:processed-internal-links, "broken", $x); exit returning ();}
+ catch * { }
+ if( not(local:alive($http-call)))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
else {}
- if(not (local:get-media-type($http-call[1]) = $supported-media-types))
- then {map:insert($local:processed-internal-links, "clean", $x); exit returning ();}
+
+ if(not (local:get-media-type($http-call[1]) = "text/html"))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
else {}
- variable $string-content := xs:string($http-call[2]);
+ variable $string-content := string($http-call[2]);
variable $content:=();
try{
$content:=html:parse($string-content,local:tidy-options() );
+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
}
catch *
- {
- map:insert($local:processed-internal-links, concat("cannot tidy", $err:description), $x);
+ {
+ map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
+ <RESULT>broken</RESULT>), $x);
try{
$content:=parse-xml:parse-xml-fragment ($string-content, "");
}
catch *
- { map:insert($local:processed-internal-links, concat("cannot parse", $err:description), $x);}
+ { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
}
variable $links :=();
if(empty($content))
- then $links:=local:get-out-links-unparsed($string-content, $x);
+ then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
else $links:=local:get-out-links-parsed($content, $x);
for $l in $links
- return local:process-link($l, $n+1);
-};
-
-
-
-declare %ann:sequential function local:process-link($x as xs:string, $n as xs:integer) as item()*{
- if(local:is-internal($x))
- then local:process-internal-link($x, $n);
- else local:process-external-link($x);
-
-};
+ return local:process-link($l, $x, $n+1);
+};
+
+
declare function local:print-results() as element()*
{
for $x in map:keys($local:processed-internal-links)/map:attribute/@value/string()
- return <INTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-internal-links,$x)}</RESULT></INTERNAL>,
+ return <INTERNAL><LINK>{$x}</LINK>{map:get($local:processed-internal-links,$x)}</INTERNAL>,
for $x in map:keys($local:processed-external-links)/map:attribute/@value/string()
- return <EXTERNAL><LINK>{$x}</LINK><RESULT>{map:get($local:processed-external-links,$x)}</RESULT></EXTERNAL>
+ return <EXTERNAL><LINK>{$x}</LINK>{map:get($local:processed-external-links,$x)}</EXTERNAL>
};
(:==========================================
@@ -190,13 +218,15 @@
variable $result;
-try {
- local:create-containers();
- local:process-link($uri, 1);
- $result:=local:print-results() ;
-}
-catch * { $result:=concat("an error occurred", $err:description);}
-
+local:create-containers();
+local:process-link($uri, "", 1);
+$result:=local:print-results() ;
local:delete-containers();
-$result
+
+file:write(fn:resolve-uri("link_crawler_result.xml"),
+ <result>{$result}</result>,
+ <output:serialization-parameters>
+ <output:indent value="yes"/>
+ </output:serialization-parameters>)
+
Follow ups