
calibre-devs team mailing list archive

Initial chm2lrf implementation!


Hello list,

Attached is a piece of code I've been messing with on and off for the
last couple of weekends, which uses PyCHM/CHMlib to implement a CHM ->
LRF converter. The output isn't *great* yet; the HTML inside CHM
files is often quite nasty (lots and lots of nested tables, no
standardisation or metadata, etc.), and I've found it often just causes
my PRS-505 to reboot itself. This code works as a stand-alone script
as long as calibre is installed -- just run "./chm2lrf.py -o
output.lrf mychm.chm" and leave it to do its funky thing.

As my Python is very rusty -- I'm a sysadmin by trade, so I generally
hack Perl ;p -- the code could probably use some tidying; any hints or
suggestions for improvement would be much appreciated. More work is
definitely needed to clean up the HTML extracted from the CHM file,
but I'm not quite sure what kind of markup is permitted inside LRF
files, so I'm not sure where to focus effort from here onwards.
Removing tables is definitely required; I regularly get "table too
large"-style errors from html2lrf...

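Something along these lines might do for a first pass at stripping
tables out -- an untested sketch against the same BeautifulSoup API
the attached script already uses (_flatten_tables is a hypothetical
helper, not part of the attachment):

def _flatten_tables(self, soup):
    # Hoist the contents of every <table>, <tr> and <td>/<th> up into
    # the surrounding document and drop the table markup itself, so
    # html2lrf never sees a table at all.
    for tag in soup(['table', 'tr', 'td', 'th']):
        parent = tag.parent
        # locate the tag by identity: BeautifulSoup's Tag.__eq__
        # compares by value, so list.index() could match a sibling
        idx = [i for i, c in enumerate(parent.contents) if c is tag][0]
        # move the children up to the tag's old position, in order
        for child in reversed(tag.contents[:]):
            parent.insert(idx, child)
        tag.extract()
    return soup

_reformat() could call this just before prettify(), once the nav
tables have already been chopped off.
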
I've also started idling in #calibre on freenode; my nick is
"fluffle". No one else is there full-time yet, though someone did
drop by temporarily. Come say hi! ;)

--alex
from __future__ import with_statement
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__  = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
                 ' and Alex Bramley <a.bramley at gmail.com>.'

import sys, os, re, shutil
from tempfile import mkdtemp
from mimetypes import guess_type as guess_mimetype
from htmlentitydefs import name2codepoint

from BeautifulSoup import BeautifulSoup
from chm.chm import CHMFile
from chm.chmlib import (
  CHM_RESOLVE_SUCCESS, CHM_ENUMERATE_NORMAL, chm_enumerate,
)

from calibre.ebooks.lrf import option_parser as lrf_parser
from calibre.ebooks.metadata.opf import OPFCreator, Guide
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.lrf.html.convert_from import process_file as html_process_file

def option_parser():
    parser = lrf_parser('Usage: %prog [options] mybook.chm')
    parser.add_option(
        '-d', '--output-dir', default='.', 
        help=_('Output directory. Defaults to current directory.'))
    return parser

class CHMError(Exception):
    pass

class CHMReader(CHMFile):
    def __init__(self, input):
        CHMFile.__init__(self)
        if not self.LoadCHM(input):
            raise CHMError("Unable to open CHM file '%s'"%(input,))
        self._contents = None
        self._playorder = 0
        self._metadata = False
        self._extracted = False
        
        # we'll be creating two new files on top of the extracted stuff from
        # the CHM -- OPF metadata and NCX table of contents. Let's put them in
        # the same place as the '.hhc' file, which is the CHM TOC. 
        self.root, ext = os.path.splitext(self.topics.lstrip('/'))
        self.opf_path = self.root + ".opf"
        self.ncx_path = self.root + ".ncx"
    
    def GetMetadata(self, basedir=os.getcwdu()):
        '''Gets meta-data from the CHM file into an OPFCreator object.
        Takes an optional 'basedir' argument, which is provided to the
        created meta-data objects so that they can work out relative paths.'''

        self.opf = OPFCreator(basedir, self.title)
        self.opf.title_sort = self._title_sort()
        
        # now, attempt to grab vaguely standard metadata from the "home" page.
        home = BeautifulSoup(self.GetFile(self.home))
        self._get_authors(home)
        self._get_publisher(home)
        self._get_isbn(home)
        self._get_comments(home)
        self._get_coverpath(home)

        self.opf.create_manifest(map(lambda x: (x, guess_mimetype(x)[0]), self.Contents()))
        tocsoup = BeautifulSoup(self.GetTopicsTree())
        self.toc = self._parse_toc(tocsoup.body.ul, basedir)
        # we are providing an ncx index too, so let's only put top-level
        # TOC stuff in the spine, for brevity's sake...
        self.opf.create_spine([item.href for item in self.toc if item.href])
        self.opf.set_toc(self.toc)
        self.opf.guide = self._create_guide(tocsoup, basedir)
        self._metadata = True
    
    def _title_sort(self):
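        # shuffle a leading article to the end for sorting,
        # e.g. "The Example" -> "Example, The"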
        prefixes = ('a ', 'the ')
        ts = self.title
        for prefix in prefixes:
            if ts[0:len(prefix)].lower() == prefix:
                ts = ts[len(prefix):] + ", " + ts[0:len(prefix)-1]
                break
        return ts

    def _metadata_from_table(self, soup, searchfor):
        td = soup.find('td', text=re.compile(searchfor, flags=re.I))
        if td is None:
            return None
        td = td.parent
        # there appears to be multiple ways of structuring the metadata
        # on the home page. cue some nasty special-case hacks...
        if re.match(r'^\s*'+searchfor+r'\s*$', td.renderContents(), flags=re.I):
            meta = self._detag(td.findNextSibling('td'))
            return re.sub('^:', '', meta).strip()
        else:
            meta = self._detag(td)
            return re.sub(r'^[^:]+:', '', meta).strip()
    
    def _metadata_from_span(self, soup, searchfor):
        span = soup.find('span', {'class': re.compile(searchfor, flags=re.I)})
        if span is None:
            return None
        # this metadata might need some cleaning up still :/
        return span.renderContents().strip()

    def _get_authors(self, soup):
        aut = (self._metadata_from_span(soup, r'author')
            or self._metadata_from_table(soup, r'^\s*by\s*:?\s+'))
        if aut is None:
            self.opf.authors = [u'Unknown']
            self.opf.author_sort = u''
        else:
            aut = re.split(r'\s*(?:,|\band\b)\s*',
                re.sub(re.compile(r'^\s*by:?\s*', flags=re.I), '', aut))
            self.opf.authors = aut
            aut = aut[0].split()
            # assume sorting by first named author's surname
            # and further that surname == name.split()[-1]
            self.opf.author_sort = aut[-1] + ', ' + ' '.join(aut[0:-1])

    def _get_publisher(self, soup):
        self.opf.publisher = (self._metadata_from_span(soup, 'imprint')
            or self._metadata_from_table(soup, 'publisher'))

    def _get_isbn(self, soup):
        isbn = (self._metadata_from_span(soup, 'isbn')
            or self._metadata_from_table(soup, 'isbn'))
        # not every book conveniently labels its ISBN on the home page
        if isbn is not None:
            self.opf.isbn = re.sub(re.compile(r'^\s*isbn\s*:', flags=re.I), '', isbn).strip()

    def _get_comments(self, soup):
        date = (self._metadata_from_span(soup, 'cwdate')
            or self._metadata_from_table(soup, 'pub date'))
        pages = (self._metadata_from_span(soup, 'pages')
            or self._metadata_from_table(soup, 'pages'))
        try:
            # date span can have copyright symbols in it...
            date = date.replace(u'\u00a9', '').strip()
            # and pages often comes as '(\d+ pages)'
            pages = re.search(r'\d+', pages).group(0)
            self.opf.comments = u'Published %s, %s pages.' % (date, pages)
        except AttributeError:
            self.opf.comments = u''

    def _get_coverpath(self, soup):
        self.opf.cover = None
        try:
            self.opf.cover = soup.find('img', alt=re.compile('cover', flags=re.I))['src']
        except TypeError:
            # meeehh, no handy alt-tag goodness, try some hackery
            # the basic idea behind this is that in general, the cover image
            # has a height:width ratio of ~1.25, whereas most of the nav
            # buttons are decidedly less than that. 
            # what we do in this is work out that ratio, take 1.25 off it and 
            # save the absolute value when we sort by this value, the smallest
            # one is most likely to be the cover image, hopefully.
            r = {}
            for img in soup('img'):
                try:
                    r[abs(float(img['height'])/float(img['width'])-1.25)] = img['src']
                except (KeyError, ValueError, ZeroDivisionError):
                    # interestingly, occasionally the only image without height
                    # or width attrs is the cover...
                    if img.get('src') is not None:
                        r[0] = img['src']
            if r:
                self.opf.cover = r[min(r.keys())]
        # this link comes from the internal html, which is in a subdir
        if self.opf.cover is not None:
            self.opf.cover = self.root + "/" + self.opf.cover

    def _create_guide(self, soup, basedir=os.getcwdu()):
        guide = Guide()
        guide.set_basedir(basedir)
        titlepage = Guide.Reference(self.home.lstrip('/'), basedir)
        titlepage.title = u'About this E-Book'
        titlepage.type = u'title-page'
        guide.append(titlepage)
        # let's try and get useful guide things from our toc soup
        # map the guide type attribute to name and search regex
        types = {
           'toc': [u'Table of Contents', '(?:table of )?contents?'],
           'copyright-page': [u'Copyright', 'copyright'],
           'dedication': [u'Dedication', 'dedication'],
           'preface': [u'Preface', 'preface'],
           'foreword': [u'Foreword', 'foreword'],
           'acknowledgements': [u'Acknowledgements', 'acknowledgements'],
           'bibliography': [u'Bibliography', 'bibliography'],
           'index': [u'Index', 'index'],
           'glossary': [u'Glossary', 'glossary'],
           'colophon': [u'Colophon', 'colophon'],
           'text': [u'Start of Content', 'chapter 1'],
        }
        for gtype, name in types.items():
            obj = soup.find('param', {
                'name': 'Name',
                'value': re.compile(name[1], re.I)
            })
            if obj is None: continue
            href = obj.parent.find('param', {'name': 'Local'})['value']
            ref = Guide.Reference(href, basedir)
            ref.title = name[0]
            ref.type = gtype
            guide.append(ref)
        return guide

    def _parse_toc(self, ul, basedir=os.getcwdu()):
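        # recursively walk the nested <ul>/<li> structure of the .hhc
        # sitemap; each entry's title and href live in <param> tags
        # inside its <object> element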
        toc = TOC(play_order=self._playorder, base_path=basedir)
        self._playorder += 1
        for li in ul('li', recursive=False):
            href = li.object('param', {'name': 'Local'})[0]['value']
            if '#' in href:
                href, frag = href.split('#', 1)
            else:
                frag = None
            name = self._deentity(li.object('param', {'name': 'Name'})[0]['value']) 
            toc.add_item(href, frag, name, play_order=self._playorder)
            self._playorder += 1
            if li.ul:
                child = self._parse_toc(li.ul, basedir)
                child.parent = toc
                toc.append(child)
        return toc
    
    def _detag(self, tag):
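        # recursively flatten a tag into plain, entity-decoded text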
        text = ""
        for elem in tag:
            if hasattr(elem, "contents"):
                text += self._detag(elem)
            else:
                text += self._deentity(elem)
        return text

    def _deentity(self, elem):
        def replace_entity(m):
            if m.group(1)=='#':
                try:
                    return unichr(int(m.group(2)))
                except ValueError:
                    return '&#%s;' % m.group(2)
            try:
                return unichr(name2codepoint[m.group(2)])
            except KeyError:
                return '&%s;' % m.group(2)
        # rargh nbsp => \xa0, not a real space
        return re.sub(r'\s+', ' ', re.sub(r'&(#?)([^;]+);', replace_entity, elem).replace(u'\u00a0', ' '))

    def GetFile(self, path):
        # have to have abs paths for ResolveObject, but Contents() deliberately
        # makes them relative. So we don't have to worry, re-add the leading /.
        if path[0] != '/':
            path = '/' + path
        res, ui = self.ResolveObject(path)
        if res != CHM_RESOLVE_SUCCESS:
            raise CHMError("Unable to locate '%s' within CHM file '%s'"%(path, self.filename))
        size, data = self.RetrieveObject(ui)
        if size == 0:
            raise CHMError("'%s' is zero bytes in length!"%(path,))
        return data

    def ExtractFiles(self, output_dir=os.getcwdu()):
        for path in self.Contents():
            lpath = os.path.join(output_dir, path)
            self._ensure_dir(lpath)
            data = self.GetFile(path)
            with open(lpath, 'wb') as f:
                if guess_mimetype(path)[0] == 'text/html':
                    data = self._reformat(data)
                f.write(data)
        self._extracted = True

    def _reformat(self, data):
        try:
            html = BeautifulSoup(data)
        except UnicodeEncodeError:
            # hit some strange encoding problems...
            print "Unable to parse html for cleaning, leaving it :("
            return data
        # nuke javascript...
        for s in html('script'):
            s.extract()
        # remove forward and back nav bars from the top/bottom of each page
        # cos they really fuck with the flow of things and generally waste space
        # since we can't use [a,b] syntax to select arbitrary items from a list
        # we'll have to do this manually...
        t = html('table')
        if t:
            if (t[0].previousSibling is None
              or t[0].previousSibling.previousSibling is None):
                t[0].extract()
            if (t[-1].nextSibling is None
              or t[-1].nextSibling.nextSibling is None):
                t[-1].extract()
        # for some very odd reason each page's content appears to be in a table
        # too. and this table has sub-tables for random asides... grr.

        # some images seem to be broken in some chm's :/
        for img in html('img'):
            try:
                # some are supposedly "relative"... lies.
                while img['src'].startswith('../'): img['src'] = img['src'][3:]
                # some have ";<junk>" at the end.
                img['src'] = img['src'].split(';')[0]
            except KeyError:
                # and some don't even have a src= ?!
                pass
        # now give back some pretty html.
        return html.prettify()

    def Contents(self):
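        # enumerate every file stored in the CHM, caching the result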
        if self._contents is not None:
            return self._contents
        paths = []
        def get_paths(chm, ui, ctx):
            # skip directories
            if ui.path[-1] != '/':
                # and make paths relative
                paths.append(ui.path.lstrip('/'))
        chm_enumerate(self.file, CHM_ENUMERATE_NORMAL, get_paths, None)
        self._contents = paths
        return self._contents

    def _ensure_dir(self, path):
        dirname = os.path.dirname(path)
        if dirname and not os.path.isdir(dirname):
            os.makedirs(dirname)

    def CreateMetafiles(self, output_dir=os.getcwdu()):
        if not self._metadata:
            self.GetMetadata(basedir=output_dir)
        # make sure the directory the metafiles land in actually exists
        self._ensure_dir(os.path.join(output_dir, self.opf_path))
        with open(os.path.join(output_dir, self.opf_path), 'wb') as opf_fd:
            with open(os.path.join(output_dir, self.ncx_path), 'wb') as ncx_fd:
                self.opf.render(opf_fd, ncx_fd, self.ncx_path)

    def extract_content(self, output_dir=os.getcwdu()):
        self.ExtractFiles(output_dir=output_dir)
        self.CreateMetafiles(output_dir=output_dir)

def process_file(f, options, logger):
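    # extract the CHM into a tempdir, generate the OPF/NCX metadata
    # alongside it, then hand the "home" page to calibre's existing
    # html2lrf machinery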
    tdir = mkdtemp(prefix='chm2oeb_')
    f = os.path.abspath(os.path.expanduser(f))
    if not options.output:
        ext = '.lrs' if options.lrs else '.lrf'
        options.output = os.path.splitext(f)[0] + ext
    rdr = CHMReader(f)
    print "Extracting CHM to", tdir
    rdr.extract_content(tdir)
    options.opf = os.path.join(tdir, rdr.opf_path)
    try:
        html_process_file(os.path.join(tdir, rdr.home.lstrip('/')), options)
    finally:
        try:
            shutil.rmtree(tdir)
        except OSError:
            print "Failed to delete tempdir", tdir


def main(args=sys.argv, logger=None):
    parser = option_parser()
    options, args = parser.parse_args(args)
    if len(args) != 2:
        parser.print_help()
        print
        print "FAIL: provide a CHM file as an argument!"
        return 1
    process_file(args[1], options, logger)
    return 0

if __name__ == '__main__':
    sys.exit(main())


