← Back to team overview

calibre-devs team mailing list archive

Gwynne Dyer Articles

 

Hi

I started down the road of creating a script for Gwynne Dyer Articles.
 I got as far as being able to get the date, title and relative URL
for the articles but I may not get time to put it in a recipe for a
while.  I thought I would throw it out there in case anyone else wants
to do it before I get back to it.  The meat of the script is below but
I am attaching the actual script.

Not sure if it makes any difference but all his articles are posted as
text files.

Tim


def process_table(table, col_offset):
    rows = table.findAll('tr')
    for tr in rows:
        cols = tr.findAll('td')
        for i,td in enumerate(cols):

            if i == 0 + col_offset:
                ''.join([e for e in td.recursiveChildGenerator()
                    if isinstance(e,unicode)])
                datestring = e.string.replace(' ', '').lstrip()
                if datestring != '':
#                    print datestring
                    date = datestring + ', 2010'
                    dt = time.strptime(date, "%B %d, %Y")
                    print time.strftime("%A, %d. %B %Y", dt)

            elif i == 1 + col_offset:
                a = td.find('a')
                if a is not None:
                    title = tag_to_string(a).strip()
                    url = a['href']
                    desc = ''
                    print "Title:    " + title + "    ->    " + "URL: " + url

# Fetch the 2010 article index and print date/title/URL for both of the
# side-by-side article lists (columns 0-1 and 2-3 of each table).
# BUG FIX: the archived message had a stray ';' inside the urlopen() call
# (an HTML-escaping artifact), which is a Python syntax error.
page = urllib2.urlopen("http://www.gwynnedyer.com/articles2010.htm")
soup = BeautifulSoup(page)

t = soup.findAll('table')
for table in t:
    process_table(table, 0)
    process_table(table, 2)
#!/usr/bin/python

import os, time, traceback, re, urlparse, sys
import urllib2
from BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag

def tag_to_string(tag, use_alt=True, normalize_whitespace=True):
    '''
    Recursively collect the text of a
    `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
    `Tag`, including CDATA sections, falling back to the ``alt`` attribute
    for child tags that carry no text of their own. Returns a (possibly
    empty) unicode string.

    `use_alt`: when `True`, substitute a child tag's ``alt`` attribute if
    the child yields no textual content.

    `tag`: the `Tag` (or plain string) to flatten; falsy values yield ''.
    '''
    # Guard clauses: nothing to do for empty input, and plain strings
    # pass straight through unchanged.
    if not tag:
        return ''
    if isinstance(tag, basestring):
        return tag

    pieces = []
    for child in tag.contents:
        if isinstance(child, (NavigableString, CData)):
            pieces.append(child.string)
        elif isinstance(child, Tag):
            text = tag_to_string(child)
            if text:
                pieces.append(text)
            elif use_alt and child.has_key('alt'):
                # Childless tag (e.g. an image): fall back to its alt text.
                pieces.append(child['alt'])

    result = u''.join(pieces)
    if normalize_whitespace:
        # Collapse any run of whitespace into a single space.
        result = re.sub(r'\s+', ' ', result)
    return result

def process_table(table, col_offset):
    rows = table.findAll('tr')
    for tr in rows:
        cols = tr.findAll('td')
        for i,td in enumerate(cols):

            if i == 0 + col_offset:
                ''.join([e for e in td.recursiveChildGenerator() 
	            if isinstance(e,unicode)])
                datestring = e.string.replace('&nbsp;', '').lstrip()
		if datestring != '':
#                    print datestring
		    date = datestring + ', 2010'
		    dt = time.strptime(date, "%B %d, %Y")
		    print time.strftime("%A, %d. %B %Y", dt)

	    elif i == 1 + col_offset:
		a = td.find('a')
                if a is not None:
                    title = tag_to_string(a).strip()
                    url = a['href']
                    desc = ''
		    print "Title:    " + title + "    ->    " + "URL: " + url

# Fetch the 2010 article index and print date/title/URL for both of the
# side-by-side article lists (columns 0-1 and 2-3 of each table).
# BUG FIX: the archived message had a stray ';' inside the urlopen() call
# (an HTML-escaping artifact), which is a Python syntax error.
page = urllib2.urlopen("http://www.gwynnedyer.com/articles2010.htm")
soup = BeautifulSoup(page)

t = soup.findAll('table')
for table in t:
    process_table(table, 0)
    process_table(table, 2)
#    rows = table.findAll('tr')
#    for tr in rows:
#        cols = tr.findAll('td')
#        for i,td in enumerate(cols):
#            if i == 0: # Date of first article
#                date = td.find('font') 
#		if date != None:
#		    datestring = date.string.replace('&nbsp;', '')
#		    print datestring
#            elif i == 1:
#		a = td.find('a')
#                if a is not None:
#                    title = tag_to_string(a).strip()
#                    url = a['href']
#                    desc = ''
#                    print "Title:    " + title + "    ->    " + "URL: " + url