zorba-coders team mailing list archive
-
zorba-coders team
-
Mailing list archive
-
Message #24190
[Merge] lp:~zorba-coders/zorba/corrected_project_TAG_HTML_module into lp:zorba
Sorin Marian Nasoi has proposed merging lp:~zorba-coders/zorba/corrected_project_TAG_HTML_module into lp:zorba.
Commit message:
- corrected @project TAG
Requested reviews:
Sorin Marian Nasoi (sorin.marian.nasoi)
For more details, see:
https://code.launchpad.net/~zorba-coders/zorba/corrected_project_TAG_HTML_module/+merge/174346
--
https://code.launchpad.net/~zorba-coders/zorba/corrected_project_TAG_HTML_module/+merge/174346
Your team Zorba Coders is subscribed to branch lp:zorba.
=== added file 'CMakeLists.txt'
--- CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ CMakeLists.txt 2013-07-12 07:41:39 +0000
@@ -0,0 +1,38 @@
+# Copyright 2006-2010 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 2.6)
+
+project(zorba_html_module)
+enable_testing()
+include(CTest)
+
+if(WIN32)
+  # Windows builds resolve third-party libraries through proxy find-modules
+  # (macros from ProxyFindModule.cmake), which probe, in this order:
+  #   1. the directory named by ZORBA_THIRD_PARTY_REQUIREMENTS
+  #   2. the Program Files directory on the user's computer
+  #   3. the PATH environment variable
+  # Appended first so these proxies precede the generic modules on the path.
+  list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules/Windows")
+endif()
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules")
+
+find_package(Zorba REQUIRED HINTS "${ZORBA_BUILD_DIR}")
+message(STATUS "Zorba_USE_FILE ${Zorba_USE_FILE}")
+include("${Zorba_USE_FILE}")
+
+add_subdirectory("src")
+
+DONE_DECLARING_ZORBA_URIS()
=== renamed file 'CMakeLists.txt' => 'CMakeLists.txt.moved'
=== added directory 'cmake_modules'
=== renamed directory 'cmake_modules' => 'cmake_modules.moved'
=== added file 'cmake_modules/FindLibTidy.cmake'
--- cmake_modules/FindLibTidy.cmake 1970-01-01 00:00:00 +0000
+++ cmake_modules/FindLibTidy.cmake 2013-07-12 07:41:39 +0000
@@ -0,0 +1,60 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# - Try to find the HTML Tidy lib
+#
+# LIBTIDY_FOUND - true if LIBTIDY was found
+# LIBTIDY_INCLUDE_DIRS - Directory to include to get LIBTIDY headers
+# Note: this is the directory that contains tidy.h itself, so
+# include the header as <tidy.h> (not as tidy/tidy.h)
+# LIBTIDY_LIBRARIES - Libraries to link against for the LIBTIDY
+#
+
+
+# A header location that is already known (e.g. cached) silences the output.
+if(LIBTIDY_INCLUDE_DIR)
+  set(LibTidy_FIND_QUIETLY TRUE)
+endif()
+
+# Header: tidy.h, also looked for inside a "tidy" subdirectory.
+find_path(LIBTIDY_INCLUDE_DIR tidy.h PATH_SUFFIXES tidy)
+mark_as_advanced(LIBTIDY_INCLUDE_DIR)
+
+# Library: "tidy"; no extra search paths are given.
+find_library(LIBTIDY_LIBRARY NAMES tidy)
+mark_as_advanced(LIBTIDY_LIBRARY)
+
+# Derive the result variables by hand. The FindPackageHandleStandardArgs
+# helper is deliberately not used here; the original authors noted that it
+# is "only available in cmake > 2.6".
+#
+# Outcome:
+#   found     -> LIBTIDY_FOUND is 1; LIBTIDY_LIBRARIES and LIBTIDY_INCLUDE_DIRS
+#                mirror the LIBTIDY_LIBRARY and LIBTIDY_INCLUDE_DIR cache entries
+#   not found -> LIBTIDY_FOUND is 0; LIBTIDY_LIBRARIES and LIBTIDY_INCLUDE_DIRS
+#                are unset
+#
+if(LIBTIDY_INCLUDE_DIR AND LIBTIDY_LIBRARY)
+  set(LIBTIDY_FOUND 1)
+  set(LIBTIDY_LIBRARIES ${LIBTIDY_LIBRARY})
+  set(LIBTIDY_INCLUDE_DIRS ${LIBTIDY_INCLUDE_DIR})
+  if(NOT LibTidy_FIND_QUIETLY)
+    message(STATUS "Found libtidy library : " ${LIBTIDY_LIBRARY})
+    message(STATUS "Found libtidy include path : " ${LIBTIDY_INCLUDE_DIR})
+  endif()
+else()
+  set(LIBTIDY_FOUND 0)
+  set(LIBTIDY_LIBRARIES)
+  set(LIBTIDY_INCLUDE_DIRS)
+endif()
=== added directory 'cmake_modules/Windows'
=== added file 'cmake_modules/Windows/FindJansson.cmake'
--- cmake_modules/Windows/FindJansson.cmake 1970-01-01 00:00:00 +0000
+++ cmake_modules/Windows/FindJansson.cmake 2013-07-12 07:41:39 +0000
@@ -0,0 +1,30 @@
+# Copyright 2010 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# - Try to find the Jansson lib on Windows
+#
+# This is a proxy module that calls the FindJansson.cmake module. Before
+# doing that, we try to guess where Jansson might be on the user's machine.
+# The user should provide ZORBA_THIRD_PARTY_REQUIREMENTS which is a path where
+# the Jansson directory can be found. The Jansson directory must have "jansson"
+# (case insensitive) in its name.
+#
+# This module helps the Windows user to avoid providing the following two
+# variables when building Zorba:
+# -D Jansson_INCLUDE="path_to_3rd_party_dir\*jansson*\src"
+# -D Jansson_LIBRARY="path_to_3rd_party_dir\*jansson*\bin\[Release\]jansson.lib"
+#
+# See the FindJansson.cmake module shipped with Zorba for more information.
+
+FIND_PACKAGE_WIN32(NAME Jansson FOUND_VAR Jansson_FOUND SEARCH_NAMES jansson) # proxy lookup reporting through Jansson_FOUND; the macro is defined outside this patch (presumably ProxyFindModule.cmake -- TODO confirm)
=== added file 'cmake_modules/Windows/FindLibTidy.cmake'
--- cmake_modules/Windows/FindLibTidy.cmake 1970-01-01 00:00:00 +0000
+++ cmake_modules/Windows/FindLibTidy.cmake 2013-07-12 07:41:39 +0000
@@ -0,0 +1,37 @@
+# Copyright 2010 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# - Try to find the HTML Tidy lib on Windows
+#
+# This is a proxy module that calls the FindLibTidy.cmake module. Before
+# doing that, we try to guess where LibTidy might be on the user's machine.
+# The user should provide ZORBA_THIRD_PARTY_REQUIREMENTS which is a path where
+# the LibTidy directory can be found. The LibTidy directory must have "tidy"
+# (case insensitive) in its name.
+#
+# This module helps the Windows user to avoid providing the following two
+# variables when building Zorba:
+# -D LIBTIDY_INCLUDE_DIR="path_to_3rd_party_dir\*tidy*\include"
+# -D LIBTIDY_LIBRARY="path_to_3rd_party_dir\*tidy*\lib\tidy.lib"
+#
+# See the FindLibTidy.cmake module shipped with Zorba for more information.
+
+# Delegate to the generic module after guessing where a "tidy" directory lives.
+FIND_PACKAGE_WIN32(NAME LibTidy FOUND_VAR LIBTIDY_FOUND SEARCH_NAMES tidy)
+
+if(LIBTIDY_FOUND)
+  # Locate the runtime DLL as well. FOUND_LOCATION is presumably set by
+  # FIND_PACKAGE_WIN32 -- TODO confirm in ProxyFindModule.cmake.
+  FIND_PACKAGE_DLLS_WIN32(${FOUND_LOCATION} tidy.dll)
+endif()
=== added directory 'src'
=== renamed directory 'src' => 'src.moved'
=== added file 'src/CMakeLists.txt'
--- src/CMakeLists.txt 1970-01-01 00:00:00 +0000
+++ src/CMakeLists.txt 2013-07-12 07:41:39 +0000
@@ -0,0 +1,43 @@
+# Copyright 2006-2008 The FLWOR Foundation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#
+# LibTidy
+#
+if(ZORBA_SUPPRESS_LIBTIDY)
+  message(STATUS "ZORBA_SUPPRESS_LIBTIDY is true - not searching for LibTidy.")
+else()
+  # LibTidy is optional: when it is missing the project is flagged unavailable.
+  message(STATUS "Looking for LibTidy")
+  find_package(LibTidy)
+
+  if(LIBTIDY_FOUND)
+    message(STATUS "Found LibTidy library -- " ${LIBTIDY_LIBRARIES})
+    set(HTML_LINK_LIBRARIES ${LIBTIDY_LIBRARIES})
+    include_directories(${LIBTIDY_INCLUDE_DIR})
+    include_directories("html.xq.src")
+    DECLARE_ZORBA_SCHEMA(FILE "html-options.xsd"
+      URI "http://www.zorba-xquery.com/modules/converters/html-options")
+    DECLARE_ZORBA_MODULE(URI "http://www.zorba-xquery.com/modules/converters/html" VERSION 1.0 FILE "html.xq" LINK_LIBRARIES "${LIBTIDY_LIBRARIES}")
+    ADD_TEST_DIRECTORY("${PROJECT_SOURCE_DIR}/test")
+    # Compile-only check of the link crawler query (note the --compile-only flag).
+    add_test(zorba_html_module/link_crawler_test_for_compilation "${ZORBA_EXE}" -f -q "${PROJECT_SOURCE_DIR}/test/Queries/link_crawler2.xq2" --compile-only)
+  else()
+    # Name the cache entries FindLibTidy.cmake reads: LIBTIDY_LIBRARY, not LIBTIDY_LIBRARIES.
+    message(STATUS "LibTidy library not found -- if you want to use HTML Tidy functionality please set LIBTIDY_INCLUDE_DIR and LIBTIDY_LIBRARY cmake parameters.")
+    set_property(GLOBAL PROPERTY ZORBA_PROJECT_UNAVAILABLE 1)
+  endif()
+endif()
+message(STATUS "")
+
=== added file 'src/html-options.xsd'
--- src/html-options.xsd 1970-01-01 00:00:00 +0000
+++ src/html-options.xsd 2013-07-12 07:41:39 +0000
@@ -0,0 +1,40 @@
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:ho="http://www.zorba-xquery.com/modules/converters/html-options"
+ targetNamespace="http://www.zorba-xquery.com/modules/converters/html-options"
+ elementFormDefault="qualified" attributeFormDefault="unqualified"
+ >
+<!--
+:: Copyright 2006-2008 The FLWOR Foundation.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+:: http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+::
+-->
+
+
+ <xs:element name="options"> <!-- container for the list of tidy parameters -->
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="tidyParam" minOccurs="0" maxOccurs="unbounded"> <!-- zero or more entries -->
+ <xs:complexType>
+ <xs:simpleContent>
+ <xs:extension base="xs:string"> <!-- string content is allowed in addition to the attributes -->
+ <xs:attribute name="name" type="xs:string" use="required"/> <!-- name of the tidy option -->
+ <xs:attribute name="value" type="xs:string" use="required"/> <!-- value assigned to that option -->
+ </xs:extension>
+ </xs:simpleContent>
+ </xs:complexType>
+ </xs:element>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+</xs:schema>
\ No newline at end of file
=== added file 'src/html.xq'
--- src/html.xq 1970-01-01 00:00:00 +0000
+++ src/html.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,130 @@
+xquery version "3.0";
+
+(:
+ : Copyright 2006-2009 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+:)
+
+(:~
+ : <p>
+ : This module provides functions to <a href="http://www.w3.org/People/Raggett/tidy/" target="_blank">tidy</a> an HTML document. <br />
+ : The functions in this module take an HTML document (a string) as parameter,
+ : tidy it in order to result in valid XHTML, and return this XHTML document as a document-node.
+ : </p>
+ :
+ :
+ : @author Sorin Nasoi
+ : @library <a href="http://tidy.sourceforge.net/">Tidy C++ Library</a>
+ : @project Zorba/Data Converters/HTML
+ :
+ :)
+module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+
+(:~
+ : Provides schema:is-validated(), used to avoid re-validating html options.
+ :)
+import module namespace schema = "http://www.zorba-xquery.com/modules/schema";
+
+import schema namespace html-options = "http://www.zorba-xquery.com/modules/converters/html-options";
+(: Standard XQuery error namespace (the URI previously read "ww.w3.org"). :)
+declare namespace err = "http://www.w3.org/2005/xqt-errors";
+
+declare namespace ver = "http://www.zorba-xquery.com/options/versioning";
+declare option ver:module-version "1.0";
+
+(:~
+ : <p>This function tidies the given HTML string and returns
+ : a valid XHTML document node.</p>
+ :
+ : <p>This function automatically sets the following tidying parameters:
+ : <ul>
+ : <li>output-xml=yes</li>
+ : <li>doctype=omit</li>
+ : <li>quote-nbsp=no</li>
+ : <li>char-encoding=utf8</li>
+ : <li>newline=LF</li>
+ : <li>tidy-mark=no</li>
+ : </ul>
+ : </p>
+ :
+ : @param $html the HTML string to tidy
+ : @return the tidied XML document
+ :
+ : @error html:InternalError if an internal error occurred while tidying
+ : the string.
+ :
+ : @example test/Queries/tidy_2.xq
+ :)
+declare function html:parse (
+  $html as xs:string
+) as document-node()
+{
+  (: Pass the built-in defaults, validated against html-options, to the external parser. :)
+  html:parse-internal($html,
+    validate {
+      <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+        <tidyParam name="output-xml" value="yes" />
+        <tidyParam name="doctype" value="omit" />
+        <tidyParam name="quote-nbsp" value="no" />
+        <tidyParam name="char-encoding" value="utf8" />
+        <tidyParam name="newline" value="LF" />
+        <tidyParam name="tidy-mark" value="no" />
+      </options>
+    }
+  )
+};
+
+(:~
+ : <p>This function tidies the given HTML string and returns
+ : a valid XHTML document node.</p>
+ :
+ : <p>The second parameter allows specifying options that
+ : configure the tidy process. This parameter is a sequence
+ : of name=value pairs. Allowed parameter names and values
+ : are documented at <a href="http://tidy.sourceforge.net/docs/quickref.html">
+ : http://tidy.sourceforge.net/docs/quickref.html</a>.</p>
+ :
+ : @param $html the HTML string to tidy
+ : @param $options a set of name and value pairs that provide options
+ : to configure the tidy process that have to be validated against the
+ : "http://www.zorba-xquery.com/modules/converters/html-options" schema.
+ : @return the tidied XHTML document node
+ :
+ : @error err:XQDY0027 if $options can not be validated against the
+ : html-options schema
+ : @error html:TidyOption if there was an error with one of the options
+ : in the $options parameter that couldn't have been caught by validating
+ : against the schema
+ : @error html:InternalError if an internal error occurred while tidying
+ : the string.
+ :
+ : @example test/Queries/tidy_1.xq
+ :)
+declare function html:parse (
+  $html as xs:string,
+  $options as element(html-options:options)
+) as document-node()
+{
+  (: Validate only when the caller's element has not been validated yet. :)
+  html:parse-internal(
+    $html,
+    if (schema:is-validated($options)) then $options
+    else validate { $options }
+  )
+};
+
+declare %private function html:parse-internal(
+ $html as xs:string,
+ $options as element(html-options:options)
+) as document-node() external; (: no XQuery body: declared external and %private, so it is reached only through html:parse :)
=== added directory 'src/html.xq.src'
=== added file 'src/html.xq.src/html.cpp'
--- src/html.xq.src/html.cpp 1970-01-01 00:00:00 +0000
+++ src/html.xq.src/html.cpp 2013-07-12 07:41:39 +0000
@@ -0,0 +1,145 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <sstream>
+
+#include <zorba/empty_sequence.h>
+#include <zorba/singleton_item_sequence.h>
+#include <zorba/item.h>
+
+#include "html.h"
+#include "tidy_wrapper.h"
+
+namespace zorba
+{
+ namespace htmlmodule
+ {
+
+//*****************************************************************************
+//*****************************************************************************
+
+// Only hands the owning module on to the HtmlFunction base class.
+ParseFunction::ParseFunction(const HtmlModule* aModule)
+  : HtmlFunction(aModule)
+{}
+
+// Evaluates html:parse-internal: reads the HTML input from the first argument
+// (as a stream when the item offers one, otherwise via its string value),
+// reads the options element from the second argument when present, and
+// returns the item produced by createHtmlItem() as a one-item sequence.
+ItemSequence_t
+ParseFunction::evaluate(
+  const ExternalFunction::Arguments_t& aArgs,
+  const StaticContext* aSctxCtx,
+  const DynamicContext* aDynCtx) const
+{
+  Item lStringItem, lOptionsItem;
+
+  if (aArgs.size() >= 1)
+  {
+    Iterator_t lArg0Iter = aArgs[0]->getIterator();
+    lArg0Iter->open();
+    lArg0Iter->next(lStringItem);
+    lArg0Iter->close();
+  }
+
+  // For non-streamable input the text is copied into this function-local
+  // stream. An automatic object replaces the former heap allocation held in a
+  // std::auto_ptr (deprecated, removed in C++17); it lives until the
+  // function returns, i.e. for the whole createHtmlItem() call below.
+  std::istringstream lLocalStream;
+  std::istream* is;
+
+  if ( lStringItem.isStreamable() )
+  {
+    // Borrowed only: per the original authors the stream is owned elsewhere.
+    is = &lStringItem.getStream();
+  }
+  else
+  {
+    String docString = lStringItem.getStringValue();
+    lLocalStream.str(docString.c_str());
+    is = &lLocalStream;
+  }
+
+  if (aArgs.size() == 2)
+  {
+    Iterator_t lArg1Iter = aArgs[1]->getIterator();
+    lArg1Iter->open();
+    lArg1Iter->next(lOptionsItem);
+    lArg1Iter->close();
+  }
+
+  return ItemSequence_t(new SingletonItemSequence(createHtmlItem(*is, lOptionsItem)));
+}
+
+//*****************************************************************************
+//*****************************************************************************
+
+// Filled on the first call to HtmlModule::getItemFactory(); starts out null.
+ItemFactory* HtmlModule::theFactory = 0;
+
+// Frees every function object cached by getExternalFunction().
+HtmlModule::~HtmlModule()
+{
+  FuncMap_t::const_iterator lCurrent = theFunctions.begin();
+  const FuncMap_t::const_iterator lEnd = theFunctions.end();
+  for ( ; lCurrent != lEnd; ++lCurrent)
+    delete lCurrent->second;
+  theFunctions.clear();
+}
+
+// Returns the function object for aLocalname, creating and caching it on
+// first use. Only "parse-internal" is known; for any other name a null
+// entry is stored in the cache and a null pointer is returned.
+ExternalFunction*
+HtmlModule::getExternalFunction(const String& aLocalname)
+{
+  // operator[] inserts a null slot when the name has not been seen before.
+  ExternalFunction*& lSlot = theFunctions[aLocalname];
+  if (lSlot)
+    return lSlot;
+
+  if (aLocalname == "parse-internal")
+    lSlot = new ParseFunction(this);
+  return lSlot;
+}
+
+// Disposes of the module by deleting this object; createModule() allocates
+// it with new.
+void
+HtmlModule::destroy()
+{
+  // No type check is needed: inside a HtmlModule member, `this` already has
+  // the static type HtmlModule*, so a dynamic_cast to it never yields null.
+  delete this;
+}
+//*****************************************************************************
+//*****************************************************************************
+
+ } /* namespace htmlmodule */
+} /* namespace zorba */
+
+#if defined(WIN32) || defined(_WIN32) // _WIN32 is compiler-predefined on Windows; WIN32 is not guaranteed
+# define DLL_EXPORT __declspec(dllexport)
+#else
+# define DLL_EXPORT __attribute__ ((visibility("default")))
+#endif
+// C-linkage entry point: creates the module (deleted again in HtmlModule::destroy()).
+extern "C" DLL_EXPORT zorba::ExternalModule* createModule()
+{
+  return new zorba::htmlmodule::HtmlModule();
+}
=== added file 'src/html.xq.src/html.h'
--- src/html.xq.src/html.h 1970-01-01 00:00:00 +0000
+++ src/html.xq.src/html.h 2013-07-12 07:41:39 +0000
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef ZORBA_HTMLMODULE_HTML_H
+#define ZORBA_HTMLMODULE_HTML_H
+
+#include <map>
+
+#include <zorba/zorba.h>
+#include <zorba/function.h>
+#include <zorba/external_module.h>
+
+namespace zorba
+{
+ namespace htmlmodule
+ {
+//*****************************************************************************
+//*****************************************************************************
+ class HtmlModule : public ExternalModule
+ {
+   private:
+     // Item factory shared by all instances; fetched on first use.
+     static ItemFactory* theFactory;
+
+   protected:
+     // Orders function names for the cache below.
+     struct ltstr
+     {
+       bool operator()(const String& aLeft, const String& aRight) const
+       {
+         return aLeft.compare(aRight) < 0;
+       }
+     };
+
+     typedef std::map<String, ExternalFunction*, ltstr> FuncMap_t;
+     // Function objects created so far, keyed by local name.
+     FuncMap_t theFunctions;
+
+   public:
+     virtual ~HtmlModule();
+
+     // Namespace URI of the XQuery module served by this class.
+     virtual String
+     getURI() const { return "http://www.zorba-xquery.com/modules/converters/html"; }
+
+     virtual ExternalFunction*
+     getExternalFunction(const String& aLocalname);
+
+     virtual void destroy();
+
+     // Returns the item factory, obtaining it from Zorba on the first call.
+     static ItemFactory*
+     getItemFactory()
+     {
+       if (theFactory == 0)
+         theFactory = Zorba::getInstance(0)->getItemFactory();
+       return theFactory;
+     }
+ };
+
+//*****************************************************************************
+//*****************************************************************************
+ // Common base of the module's external functions; remembers the owning
+ // module so the function can report the module's URI as its own.
+ class HtmlFunction : public ContextualExternalFunction
+ {
+   protected:
+     const HtmlModule* theModule;
+
+   public:
+     HtmlFunction(const HtmlModule* aModule) : theModule(aModule) {}
+
+     ~HtmlFunction() {}
+
+     virtual String getURI() const { return theModule->getURI(); }
+ };
+
+//*****************************************************************************
+//*****************************************************************************
+ // External function behind html:parse-internal; evaluate() is in html.cpp.
+ class ParseFunction : public HtmlFunction
+ {
+   public:
+     ParseFunction(const HtmlModule* aModule);
+     // Local name of the XQuery function this object implements.
+     virtual String getLocalName() const { return "parse-internal"; }
+
+     virtual ItemSequence_t
+     evaluate(const ExternalFunction::Arguments_t& aArgs,
+              const StaticContext* aSctxCtx,
+              const DynamicContext* aDynCtx) const;
+ };
+
+
+
+
+ } /* namespace htmlmodule */
+} /* namespace zorba */
+
+#endif /* ZORBA_HTMLMODULE_HTML_H */
=== added file 'src/html.xq.src/tidy_wrapper.cpp'
--- src/html.xq.src/tidy_wrapper.cpp 1970-01-01 00:00:00 +0000
+++ src/html.xq.src/tidy_wrapper.cpp 2013-07-12 07:41:39 +0000
@@ -0,0 +1,34 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <iostream>
+
+#include <tidy.h>
+#include <buffio.h>
+
+#include "tidy_wrapper.h"
+
+#include <zorba/item_factory.h>
+#include <zorba/item.h>
+#include <zorba/xmldatamanager.h>
+
+namespace zorba
+{
+ namespace htmlmodule
+ {
+ // Nothing is defined here: in this revision tidy_wrapper.h defines TidyReader and the helper functions itself.
+ } /* namespace htmlmodule */
+} /* namespace zorba */
\ No newline at end of file
=== added file 'src/html.xq.src/tidy_wrapper.h'
--- src/html.xq.src/tidy_wrapper.h 1970-01-01 00:00:00 +0000
+++ src/html.xq.src/tidy_wrapper.h 2013-07-12 07:41:39 +0000
@@ -0,0 +1,200 @@
+/*
+ * Copyright 2006-2008 The FLWOR Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ZORBA_HTMLMODULE_TIDY_WRAPPER_H
+#define ZORBA_HTMLMODULE_TIDY_WRAPPER_H
+
+#include <iostream>
+#include <sstream>
+
+#include <tidy.h>
+#include <buffio.h>
+
+#include <zorba/item_factory.h>
+#include <zorba/item.h>
+#include <zorba/iterator.h>
+#include <zorba/store_consts.h>
+#include <zorba/xmldatamanager.h>
+#include <zorba/error.h>
+#include <zorba/diagnostic_list.h>
+#include <zorba/user_exception.h>
+#include <zorba/zorba.h>
+
+namespace zorba
+{
+ namespace htmlmodule
+ {
+ // Adapts a std::istream to the TidyInputSource callback interface.
+ class TidyReader {
+   private:
+     std::istream* theStream;
+     // Bytes handed back through ungetByte(); the most recent one is read first.
+     std::vector<unsigned int> theBuffer;
+   public:
+     TidyReader(std::istream* aStream) : theStream(aStream) {}
+     // Fills in the callback table; sourceData refers to this reader object.
+     TidyInputSource getInputSource()
+     {
+       TidyInputSource lResult;
+       lResult.sourceData = this;
+       lResult.getByte = &getByte;
+       lResult.ungetByte = &ungetByte;
+       lResult.eof = &isEof;
+       return lResult;
+     }
+
+   public: // callback functions
+     // Returns the next byte: pushed-back bytes first, then the stream.
+     static int TIDY_CALL getByte(void* aData)
+     {
+       TidyReader* lReader = static_cast<TidyReader*>(aData);
+       if (lReader->theBuffer.empty())
+         return lReader->theStream->get();
+       int lResult = lReader->theBuffer.back();
+       lReader->theBuffer.pop_back();
+       return lResult;
+     }
+
+     static void TIDY_CALL ungetByte(void* aData, byte aByte)
+     {
+       static_cast<TidyReader*>(aData)->theBuffer.push_back(aByte);
+     }
+
+     // Not at EOF while pushed-back bytes are pending, even if the stream is exhausted.
+     static Bool TIDY_CALL isEof(void* aData)
+     {
+       TidyReader* lReader = static_cast<TidyReader*>(aData);
+       return (lReader->theBuffer.empty() && lReader->theStream->eof()) ? yes : no;
+     }
+ };
+
+ // Raises html:InternalError with errMsg unless rc reports success (0) or
+ // warnings only (1). Negative codes (severe tidy failures) are rejected too.
+ static void checkRC(int rc, const char* errMsg)
+ {
+   if (rc < 0 || rc > 1)
+   {
+     zorba::Item lError = Zorba::getInstance(0)->getItemFactory()->createQName(
+       "http://www.zorba-xquery.com/modules/converters/html", "InternalError");
+     throw USER_EXCEPTION(lError, errMsg );
+   }
+ }
+
+ // Sets one tidy option by name. Returns `no` (without raising) when
+ // tidyOptGetIdForName() yields no valid option id, `yes` when the value was
+ // accepted, and raises html:TidyOption when tidy rejects the value.
+ static Bool setTidyOption(TidyDoc doc, const char* option, const char* value)
+ {
+   const TidyOptionId lId = tidyOptGetIdForName(option);
+   if (!(lId < N_TIDY_OPTIONS))
+   {
+     return no;
+   }
+
+   const Bool lAccepted = tidyOptSetValue(doc, lId, value);
+   if (lAccepted == yes)
+   {
+     return lAccepted;
+   }
+
+   // tidy refused the value: report it as html:TidyOption.
+   zorba::Item lError = Zorba::getInstance(0)->getItemFactory()->createQName(
+     "http://www.zorba-xquery.com/modules/converters/html", "TidyOption");
+   std::ostringstream lErrorMsg;
+   lErrorMsg << "Error setting tidy option '" << option
+             << "' with value '" << value << "'";
+   throw USER_EXCEPTION(lError, lErrorMsg.str());
+ }
+
+ // Applies each child element of aOptions as a tidy option (attributes "name"
+ // and "value"); a null item changes nothing. Always returns yes.
+ static Bool applyOptions(TidyDoc aDoc, zorba::Item &aOptions)
+ {
+   if (aOptions.isNull())
+     return yes;
+   zorba::Item lChild, lAttr, lAttrName;
+   zorba::Iterator_t lChildren = aOptions.getChildren();
+   lChildren->open();
+   while (lChildren->next(lChild))
+   {
+     // Skip (rather than stop at) non-element children such as comments.
+     if (lChild.getNodeKind() != store::StoreConsts::elementNode)
+       continue;
+     zorba::String lStrName, lStrValue; // per element: nothing carries over
+     zorba::Iterator_t lAttributes = lChild.getAttributes();
+     lAttributes->open();
+     while (lAttributes->next(lAttr))
+     {
+       lAttr.getNodeName(lAttrName);
+       if (lAttrName.getLocalName() == "name")
+         lStrName = lAttr.getStringValue();
+       else if (lAttrName.getLocalName() == "value")
+         lStrValue = lAttr.getStringValue();
+     }
+     lAttributes->close(); // closed first, so a raising setTidyOption() leaves it closed
+     setTidyOption(aDoc, lStrName.c_str(), lStrValue.c_str());
+   }
+   lChildren->close();
+   return yes;
+ }
+
+ // Tidies aStream using aOptions and parses tidy's output as XML (NULL item if that fails).
+ // The tidy buffers and document are released on every path, including exceptions.
+ static zorba::Item createHtmlItem( std::istream& aStream , zorba::Item &aOptions)
+ {
+   TidyReader lReader(&aStream);
+   TidyInputSource lInputSource = lReader.getInputSource();
+
+   TidyBuffer output;
+   tidyBufInit(&output);
+   TidyBuffer errbuf;
+   tidyBufInit(&errbuf);
+   TidyDoc tDoc = tidyCreate();
+   std::string lResult;
+   try
+   {
+     applyOptions(tDoc, aOptions);
+     checkRC(tidySetErrorBuffer(tDoc, &errbuf), "Could not set error buffer");
+     checkRC(tidyParseSource(tDoc, &lInputSource), "Could not parse the source");
+     checkRC(tidyCleanAndRepair(tDoc), "Could not clean and repair");
+     if (tidyRunDiagnostics(tDoc) > 1)
+       tidyOptSetBool(tDoc, TidyForceOutput, yes);
+
+     // Tidy cannot stream its output, so it is collected in a buffer first.
+     checkRC(tidySaveBuffer(tDoc, &output), "Could not save the buffer");
+     if (output.bp)
+       lResult.assign((char*) output.bp, output.size);
+   }
+   catch (...)
+   {
+     tidyBufFree(&output);
+     tidyBufFree(&errbuf);
+     tidyRelease(tDoc);
+     throw;
+   }
+   tidyBufFree(&output);
+   tidyBufFree(&errbuf);
+   tidyRelease(tDoc);
+
+   std::istringstream lStream(lResult);
+   XmlDataManager* lDM = Zorba::getInstance(0)->getXmlDataManager();
+   try { return lDM->parseXML(lStream); }
+   catch (ZorbaException&) { return NULL; }
+ }
+ } /* namespace htmlmodule */
+} /* namespace zorba */
+
+#endif //ZORBA_HTMLMODULE_TIDY_WRAPPER_H
=== added directory 'test'
=== renamed directory 'test' => 'test.moved'
=== added directory 'test/ExpQueryResults'
=== added file 'test/ExpQueryResults/tidy_1.xml.res'
--- test/ExpQueryResults/tidy_1.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/tidy_1.xml.res 2013-07-12 07:41:39 +0000
@@ -0,0 +1,8 @@
+<html>
+<head>
+<title>Foo</title>
+</head>
+<body>
+<p>Foo!</p>
+</body>
+</html>
\ No newline at end of file
=== added file 'test/ExpQueryResults/tidy_2.xml.res'
--- test/ExpQueryResults/tidy_2.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/tidy_2.xml.res 2013-07-12 07:41:39 +0000
@@ -0,0 +1,8 @@
+<html>
+<head>
+<title>Foo</title>
+</head>
+<body>
+<p>Foo!</p>
+</body>
+</html>
\ No newline at end of file
=== added file 'test/ExpQueryResults/tidy_3.xml.res'
--- test/ExpQueryResults/tidy_3.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/tidy_3.xml.res 2013-07-12 07:41:39 +0000
@@ -0,0 +1,41 @@
+<html>
+<head>
+<title>[ #426885 ] Definition list w/Center crashes</title>
+</head>
+<body>
+<center>
+<h1>Heading 1</h1>
+</center>
+<dl>
+<dt>
+<img src="redball.gif"/>
+<b>Term 1</b>
+</dt>
+<dt>
+<img src="redball.gif"/>
+<b>Term 2</b>
+</dt>
+<dd>
+<hr/>
+</dd>
+</dl>
+<center>
+<h1>Heading 2</h1>
+</center>
+<div style="margin-left: 2em">
+<dl>
+<dt>
+<img src="redball.gif"/>
+<b>Term 3</b>
+</dt>
+<dt>
+<img src="redball.gif"/>
+<b>Term 4</b>
+</dt>
+<dd>
+<hr/>
+</dd>
+</dl>
+</div>
+</body>
+</html>
\ No newline at end of file
=== added file 'test/ExpQueryResults/tidy_4.xml.res'
--- test/ExpQueryResults/tidy_4.xml.res 1970-01-01 00:00:00 +0000
+++ test/ExpQueryResults/tidy_4.xml.res 2013-07-12 07:41:39 +0000
@@ -0,0 +1,9 @@
+<html>
+<head>
+<title>[#427663] Line endings not supported correctly</title>
+</head>
+<body>
+<p>This is a carriage return^MThis is a Unix line-ending This is a
+DOS line ending^M</p>
+</body>
+</html>
\ No newline at end of file
=== added directory 'test/Queries'
=== added file 'test/Queries/link_crawler2.xq2'
--- test/Queries/link_crawler2.xq2 1970-01-01 00:00:00 +0000
+++ test/Queries/link_crawler2.xq2 2013-07-12 07:41:39 +0000
@@ -0,0 +1,263 @@
+(:
+ : Copyright 2006-2011 The FLWOR Foundation.
+ :
+ : Licensed under the Apache License, Version 2.0 (the "License");
+ : you may not use this file except in compliance with the License.
+ : You may obtain a copy of the License at
+ :
+ : http://www.apache.org/licenses/LICENSE-2.0
+ :
+ : Unless required by applicable law or agreed to in writing, software
+ : distributed under the License is distributed on an "AS IS" BASIS,
+ : WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ : See the License for the specific language governing permissions and
+ : limitations under the License.
+:)
+
+import module namespace http = "http://www.zorba-xquery.com/modules/http-client";
+import module namespace map = "http://www.zorba-xquery.com/modules/store/data-structures/unordered-map";
+import module namespace html = "http://www.zorba-xquery.com/modules/converters/html";
+import module namespace parse-xml = "http://www.zorba-xquery.com/modules/xml";
+import module namespace file = "http://expath.org/ns/file";
+
+declare namespace an = "http://www.zorba-xquery.com/annotations";
+declare namespace xhtml="http://www.w3.org/1999/xhtml";
+declare namespace output="http://www.w3.org/2010/xslt-xquery-serialization";
+declare namespace err="http://www.w3.org/2005/xqt-errors";
+declare namespace httpsch = "http://expath.org/ns/http-client";
+
+declare variable $top-uri as xs:string := "http://www.zorba-xquery.com/html/index/"; (: page at which the crawl starts :)
+declare variable $uri-host as xs:string := "http://www.zorba-xquery.com"; (: links starting with this prefix are treated as internal :)
+
+(: QNames of the two maps that record processed links, and the tidy options handed to html:parse for fetched pages :)
+
+declare variable $local:processed-internal-links := xs:QName("processed-internal-links");
+declare variable $local:processed-external-links := xs:QName("processed-external-links");
+declare variable $local:tidy-options := <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyParam name="output-xml" value="yes" />
+ <tidyParam name="doctype" value="omit" />
+ <tidyParam name="quote-nbsp" value="no" />
+ <tidyParam name="char-encoding" value="utf8" />
+ <tidyParam name="newline" value="LF" />
+ <tidyParam name="tidy-mark" value="no" />
+ <tidyParam name="new-inline-tags" value="nav header section article footer xqdoc:custom d c options json-param" />
+ </options>;
+
+
+
+(: Creates the two maps that record the processed internal and external links. :)
+declare %an:sequential function local:create-containers() {
+ for $map-name in ($local:processed-internal-links, $local:processed-external-links)
+ return map:create($map-name, xs:QName("xs:string"));
+};
+
+(: Deletes every map reported by map:available-maps(). :)
+declare %an:sequential function local:delete-containers() {
+ for $map-name in map:available-maps() return map:delete($map-name);
+};
+
+(: True when link $x begins with the host prefix held in $uri-host. :)
+declare function local:is-internal($x as xs:string) as xs:boolean {
+ fn:starts-with($x, $uri-host)
+};
+
+(: Returns the part of $s1 before the first occurrence of $s2, or $s1 unchanged when $s2 is empty or does not occur in $s1; a leading $s2 yields "". :)
+declare function local:my-substring-before($s1 as xs:string, $s2 as xs:string) as xs:string {
+ if ($s2 eq "" or fn:not(fn:contains($s1, $s2)))
+ then $s1 else fn:substring-before($s1, $s2)
+};
+
+(: Resolves the whitespace-normalized $href against $start-uri and drops any "#fragment"; an href that cannot be resolved is recorded as broken in the external-links map and () is returned. :)
+declare %an:sequential function local:get-real-link($href as xs:string, $start-uri as xs:string) as xs:string? {
+ variable $absuri := (); (: initialized so that the failure path returns () instead of reading an unset variable :)
+ try{
+ $absuri := local:my-substring-before(resolve-uri(fn:normalize-space($href), $start-uri), "#");
+ }
+ catch *
+ {
+ map:insert($local:processed-external-links, (<FROM>{$start-uri}</FROM>,
+ <MESSAGE>malformed</MESSAGE>,
+ <RESULT>broken</RESULT>), $href);
+ }
+ $absuri
+};
+
+
+(: Media type of a response element: the value of its first Content-Type header (name matched case-insensitively) up to the first ";", trimmed; "" when there is no such header. :)
+declare function local:get-media-type ($http-call as node()) as xs:string {
+ fn:normalize-space(local:my-substring-before(fn:string(($http-call/httpsch:header[fn:lower-case(@name) eq "content-type"])[1]/@value), ";"))
+};
+
+(: True when $http-call is non-empty and its first item reports status 200; a false result is passed through fn:trace with the label "alive". :)
+declare function local:alive($http-call as item()*) as xs:boolean {
+ let $first-is-200 := fn:exists($http-call) and ($http-call[1]/@status eq 200)
+ return if ($first-is-200)
+ then fn:true() else fn:trace(fn:false(), "alive")
+};
+
+(: True when $http-call is non-empty and the status of its first item is in the 3xx range; a true result is passed through fn:trace with the label "redirect". :)
+declare function local:is-redirect($http-call as item()*) as xs:boolean {
+ let $first-is-3xx := fn:exists($http-call) and (($http-call[1]/@status idiv 100) eq 3)
+ return if ($first-is-3xx)
+ then fn:trace(fn:true(), "redirect") else fn:false()
+};
+
+
+(: Collects the href of every a, link and area element and the src of every script and img element (any namespace) below $content, resolves each against $uri with local:get-real-link and returns the distinct results. :)
+declare %an:sequential function local:get-out-links-parsed($content as node()*, $uri as xs:string) as xs:string* {
+ distinct-values(
+ for $link in ($content//*:a/string(@href),
+ $content//*:link/string(@href),
+ $content//*:script/string(@src), $content//*:img/string(@src),
+ $content//*:area/string(@href))
+ return local:get-real-link($link, $uri))
+};
+
+
+(: Regex fallback for page text that could not be parsed: collects the quoted href values of a, link and area tags and the quoted src values of script and img tags (raw or entity-escaped markup) and resolves each against $uri. :)
+declare %an:sequential function local:get-out-links-unparsed($content as xs:string, $uri as xs:string) as xs:string* {
+ distinct-values(
+ (: group 7 is the opening quote (closed by \7), group 8 the attribute value; fn:analyze-string results are in the fn namespace, hence *:group :)
+ let $search := fn:analyze-string($content, "(&lt;|&amp;lt;)(((a|link|area).+?href)|((script|img).+?src))=([""'])(.*?)\7")
+ for $other-uri2 in $search//*:group[@nr = 8]/string()
+ return local:get-real-link($other-uri2, $uri))
+};
+
+
+declare %an:sequential function local:map-insert-result($map-name as xs:QName, $url as xs:string, $http-result as item()*)
+{ (: Stores under key $url in map $map-name a summary of $http-result: STATUS, MESSAGE and RESULT (Ok, redirect or broken), or only RESULT broken when $http-result is empty; for a redirect a REDIRECT element holding the Location header is inserted afterwards. :)
+ if(count($http-result) ge 1)
+ then
+ map:insert($map-name, (<STATUS>{fn:string($http-result[1]/@status)}</STATUS>,
+ <MESSAGE>{fn:string($http-result[1]/@message)}</MESSAGE>,
+ <RESULT>{if(local:alive($http-result))
+ then "Ok"
+ else if(local:is-redirect($http-result))
+ then "redirect"
+ else "broken"
+ }</RESULT>), $url);
+ else map:insert($map-name, <RESULT>broken</RESULT>, $url); (: empty response, e.g. when the caller's request failed :)
+ if(local:is-redirect($http-result)) then (: NOTE(review): whether this second insert replaces or extends the entry stored above depends on map:insert semantics - confirm :)
+ map:insert($map-name, <REDIRECT>{fn:string($http-result[1]/httpsch:header[@name = "Location"]/@value)}</REDIRECT>, $url);
+ else {}
+};
+
+(: Routes link $x, found on page $baseUri, to the external or the internal handler; $n is forwarded to the internal handler only. :)
+declare %an:sequential function local:process-link($x as xs:string, $baseUri as xs:string, $n as xs:integer) as item()* {
+ if (fn:not(local:is-internal($x)))
+ then local:process-external-link($x, $baseUri);
+ else local:process-internal-link($x, $baseUri, $n);
+};
+
+(: Checks external link $x found on page $baseUri: a link already present in the external map is skipped (returns false); otherwise the referring page is stored, the link is probed with HEAD and, if the answer is not 200, requested again with GET, and the outcome is stored in the external map. :)
+declare %an:sequential function local:process-external-link($x as xs:string, $baseUri as xs:string){
+ if(not(empty(map:get($local:processed-external-links, $x))))
+ then exit returning false();
+ else {}
+ fn:trace($x, "HEAD external link");
+ map:insert($local:processed-external-links, <FROM>{$baseUri}</FROM>, $x);
+ variable $http-call:=();
+ try{
+ $http-call:=http:send-request(<httpsch:request method="HEAD" href="{$x}"/>, (), ()); (: HEAD suffices: only status and headers of an external page are used :)
+ if((count($http-call) ge 1) and
+ fn:not($http-call[1]/@status eq 200)) then
+ {
+ if(local:is-redirect($http-call)) then
+ {
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
+ }
+ else {}
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ()); (: retry with GET because the HEAD probe did not yield 200 :)
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
+ }
+ else {}
+ }
+ catch *
+ { $http-call:=();} (: a failed request counts as no response and is reported as broken below :)
+ local:map-insert-result($local:processed-external-links, $x, $http-call);
+};
+
+
+declare %an:sequential function local:process-internal-link($x as xs:string, $baseUri as xs:string, $n as xs:integer){
+ (: Fetches internal page $x (found on $baseUri), records its status in the internal map and processes every link found in it with recursion level $n+1. Disabled depth limit: if($n=3) then exit returning (); else {} :)
+ if(not(empty(map:get($local:processed-internal-links, $x))))
+ then exit returning false();
+ else {}
+ fn:trace($x, "GET internal link");
+ map:insert($local:processed-internal-links, <FROM>{$baseUri}</FROM>, $x);
+ variable $http-call:=();
+ try{
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}" follow-redirect="false"/>, (), ());
+ }
+ catch * { } (: request errors are ignored; $http-call then stays empty and is reported as broken below :)
+ if(local:is-redirect($http-call)) then
+ { (: record the redirect, then repeat the request without follow-redirect="false" :)
+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
+ try{
+ $http-call:=http:send-request(<httpsch:request method="GET" href="{$x}"/>, (), ());
+ }
+ catch * { }
+ }
+ else {}
+ if( not(local:alive($http-call)))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+ else {}
+ (: only text/html responses are searched for further links :)
+ if(not (local:get-media-type($http-call[1]) = "text/html"))
+ then { local:map-insert-result($local:processed-internal-links, $x, $http-call); exit returning ();}
+ else {}
+ variable $string-content := string($http-call[2]); (: second item of the response sequence, taken as the page body :)
+ variable $content:=();
+ (: first try tidy; if that fails, record it and fall back to parsing the body as an XML fragment :)
+ try{
+ $content:=html:parse($string-content,$local:tidy-options );
+ local:map-insert-result($local:processed-internal-links, $x, $http-call);
+ }
+ catch *
+ {
+ map:insert($local:processed-internal-links, (<MESSAGE>{concat("cannot tidy: ", $err:description)}</MESSAGE>,
+ <RESULT>broken</RESULT>), $x);
+ try{
+ $content:=parse-xml:parse-xml-fragment ($string-content, "");
+ }
+ catch *
+ { map:insert($local:processed-internal-links, <MESSAGE>{concat("cannot parse: ", $err:description)}</MESSAGE>, $x);}
+ }
+ variable $links :=();
+ if(empty($content)) (: nothing could be parsed: extract the links from the raw text by regex instead :)
+ then $links:=local:get-out-links-unparsed($string-content, fn:trace($x, "parse with regex, because tidy failed"));
+ else $links:=local:get-out-links-parsed($content, $x);
+ for $l in $links
+ return local:process-link($l, $x, $n+1);
+};
+
+
+
+
+(: Builds the report: one INTERNAL element per key of the internal map followed by one EXTERNAL element per key of the external map, each holding the LINK and the value stored for it. :)
+declare function local:print-results() as element()* {
+ (for $internal in map:keys($local:processed-internal-links)/map:attribute/@value/string()
+ return <INTERNAL><LINK>{$internal}</LINK>{map:get($local:processed-internal-links,$internal)}</INTERNAL>),
+ (for $external in map:keys($local:processed-external-links)/map:attribute/@value/string()
+ return <EXTERNAL><LINK>{$external}</LINK>{map:get($local:processed-external-links,$external)}</EXTERNAL>)
+};
+
+(:==========================================
+===========================================:)
+
+variable $uri:= $top-uri;
+(: receives the report elements once the crawl has finished :)
+variable $result;
+(: crawl from the start page: "" is passed as its referring page and 1 as the initial recursion level :)
+local:create-containers();
+local:process-link($uri, "", 1);
+$result:=local:print-results() ;
+(: the report is built above, before the maps it is read from are deleted :)
+local:delete-containers();
+(: write the report, wrapped in a result element and serialized with indent=yes, to link_crawler_result.xml :)
+file:write(fn:resolve-uri("link_crawler_result.xml"),
+ <result>{$result}</result>,
+ <output:serialization-parameters>
+ <output:indent value="yes"/>
+ </output:serialization-parameters>)
+
=== added file 'test/Queries/tidy_1.xq'
--- test/Queries/tidy_1.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_1.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,14 @@
+(: tidy an HTML fragment using explicitly specified tidy options :)
+
+import module namespace html="http://www.zorba-xquery.com/modules/converters/html";
+import schema namespace html-options="http://www.zorba-xquery.com/modules/converters/html-options";
+
+html:parse('<title>Foo</title><p>Foo!',
+ <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyParam name="output-xml" value="yes" />
+ <tidyParam name="doctype" value="omit" />
+ <tidyParam name="quote-nbsp" value="no" />
+ <tidyParam name="char-encoding" value="utf8" />
+ <tidyParam name="newline" value="LF" />
+ <tidyParam name="tidy-mark" value="no" />
+ </options>)
\ No newline at end of file
=== added file 'test/Queries/tidy_2.xq'
--- test/Queries/tidy_2.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_2.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,6 @@
+(: tidy an HTML fragment using the default tidy options (no options argument is passed) :)
+
+import module namespace html="http://www.zorba-xquery.com/modules/converters/html";
+import schema namespace html-options="http://www.zorba-xquery.com/modules/converters/html-options";
+
+html:parse('<title>Foo</title><p>Foo!')
\ No newline at end of file
=== added file 'test/Queries/tidy_3.xq'
--- test/Queries/tidy_3.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_3.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,18 @@
+import module namespace html="http://www.zorba-xquery.com/modules/converters/html";
+import schema namespace html-options="http://www.zorba-xquery.com/modules/converters/html-options";
+(: tidy, with default options, the input titled "[ #426885 ] Definition list w/Center crashes": DT items outside any DL (the second pair inside a UL), between CENTER headings :)
+html:parse('<HTML>
+<HEAD>
+<TITLE>[ #426885 ] Definition list w/Center crashes</TITLE>
+</HEAD>
+<BODY>
+<CENTER><H1>Heading 1</H1></CENTER>
+<DT><IMG src="redball.gif"><B>Term 1</B></DT>
+<DT><IMG src="redball.gif"><B>Term 2</B><HR></DT>
+<CENTER><H1>Heading 2</H1></CENTER>
+<UL>
+<DT><IMG src="redball.gif"><B>Term 3</B></DT>
+<DT><IMG src="redball.gif"><B>Term 4</B><HR></DT>
+</UL>
+</BODY>
+</HTML>')
\ No newline at end of file
=== added file 'test/Queries/tidy_4.xq'
--- test/Queries/tidy_4.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_4.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,14 @@
+import module namespace html="http://www.zorba-xquery.com/modules/converters/html";
+import schema namespace html-options="http://www.zorba-xquery.com/modules/converters/html-options";
+(: tidy, with default options, the input titled "[#427663] Line endings not supported correctly"; note that the P element is never closed in the input :)
+html:parse('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
+<html>
+<head>
+ <title>[#427663] Line endings not supported correctly</title>
+</head>
+<body>
+<p>This is a carriage return^MThis is a Unix line-ending
+This is a DOS line ending^M
+
+</body>
+</html>')
\ No newline at end of file
=== added file 'test/Queries/tidy_5_wrong_options.spec'
--- test/Queries/tidy_5_wrong_options.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_5_wrong_options.spec 2013-07-12 07:41:39 +0000
@@ -0,0 +1,1 @@
+Error: http://www.zorba-xquery.com/modules/converters/html:TidyOption
=== added file 'test/Queries/tidy_5_wrong_options.xq'
--- test/Queries/tidy_5_wrong_options.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_5_wrong_options.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,9 @@
+(: tidy an HTML fragment passing an invalid value ("maybe") for a tidy option; the accompanying .spec file expects the html:TidyOption error :)
+
+import module namespace html="http://www.zorba-xquery.com/modules/converters/html";
+import schema namespace html-options="http://www.zorba-xquery.com/modules/converters/html-options";
+
+html:parse('<title>Foo</title><p>Foo!',
+ <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyParam name="output-xml" value="maybe" />
+ </options>)
\ No newline at end of file
=== added file 'test/Queries/tidy_6_wrong_options.spec'
--- test/Queries/tidy_6_wrong_options.spec 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_6_wrong_options.spec 2013-07-12 07:41:39 +0000
@@ -0,0 +1,1 @@
+Error: http://www.w3.org/2005/xqt-errors:XQDY0027
=== added file 'test/Queries/tidy_6_wrong_options.xq'
--- test/Queries/tidy_6_wrong_options.xq 1970-01-01 00:00:00 +0000
+++ test/Queries/tidy_6_wrong_options.xq 2013-07-12 07:41:39 +0000
@@ -0,0 +1,14 @@
+(: tidy an HTML fragment using correct tidy options/values inside a wrongly formatted html-options element (its first child is spelled "tidyaram"); the accompanying .spec file expects XQDY0027 :)
+
+import module namespace html="http://www.zorba-xquery.com/modules/converters/html";
+import schema namespace html-options="http://www.zorba-xquery.com/modules/converters/html-options";
+
+html:parse('<title>Foo</title><p>Foo!',
+ <options xmlns="http://www.zorba-xquery.com/modules/converters/html-options" >
+ <tidyaram name="output-xml" value="yes" />
+ <tidyParam name="doctype" value="omit" />
+ <tidyParam name="quote-nbsp" value="no" />
+ <tidyParam name="char-encoding" value="utf8" />
+ <tidyParam name="newline" value="LF" />
+ <tidyParam name="tidy-mark" value="no" />
+ </options>)