calibre-devs team mailing list archive
-
calibre-devs team
-
Mailing list archive
-
Message #00160
Gwynne Dyer Articles
Hi
I started down the road of creating a script for Gwynne Dyer's articles.
I got as far as being able to extract the date, title, and relative URL
for each article, but I may not get time to put it in a recipe for a
while. I thought I would throw it out there in case anyone else wants
to do it before I get back to it. The meat of the script is below, but
I am also attaching the actual script.
Not sure if it makes any difference, but all of his articles are posted
as text files.
Tim
def process_table(table, col_offset):
rows = table.findAll('tr')
for tr in rows:
cols = tr.findAll('td')
for i,td in enumerate(cols):
if i == 0 + col_offset:
''.join([e for e in td.recursiveChildGenerator()
if isinstance(e,unicode)])
datestring = e.string.replace(' ', '').lstrip()
if datestring != '':
# print datestring
date = datestring + ', 2010'
dt = time.strptime(date, "%B %d, %Y")
print time.strftime("%A, %d. %B %Y", dt)
elif i == 1 + col_offset:
a = td.find('a')
if a is not None:
title = tag_to_string(a).strip()
url = a['href']
desc = ''
print "Title: " + title + " -> " + "URL: " + url
# Fetch the 2010 article index and scan every table on the page.  Each
# table carries two (date, title) column pairs, at offsets 0 and 2.
page = urllib2.urlopen("http://www.gwynnedyer.com/articles2010.htm")
soup = BeautifulSoup(page)
for table in soup.findAll('table'):
    process_table(table, 0)
    process_table(table, 2)
#!/usr/bin/python
import os, time, traceback, re, urlparse, sys
import urllib2
from BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
def tag_to_string(tag, use_alt=True, normalize_whitespace=True):
    '''
    Recursively flatten a
    `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
    `Tag` into a possibly empty unicode string.

    Text is gathered from all nested tags, including CDATA sections.  For a
    child tag that produced no text, the ``alt`` attribute is used as a
    fallback when `use_alt` is True.  A falsy `tag` yields the empty string
    and a plain string is returned unchanged.  When `normalize_whitespace`
    is True, runs of whitespace collapse to single spaces.
    '''
    if not tag:
        return ''
    if isinstance(tag, basestring):
        return tag
    pieces = []
    for child in tag.contents:
        if isinstance(child, (NavigableString, CData)):
            pieces.append(child.string)
        elif isinstance(child, Tag):
            text = tag_to_string(child)
            if text:
                pieces.append(text)
            elif use_alt and child.has_key('alt'):
                pieces.append(child['alt'])
    result = u''.join(pieces)
    if normalize_whitespace:
        result = re.sub(r'\s+', ' ', result)
    return result
def process_table(table, col_offset):
rows = table.findAll('tr')
for tr in rows:
cols = tr.findAll('td')
for i,td in enumerate(cols):
if i == 0 + col_offset:
''.join([e for e in td.recursiveChildGenerator()
if isinstance(e,unicode)])
datestring = e.string.replace(' ', '').lstrip()
if datestring != '':
# print datestring
date = datestring + ', 2010'
dt = time.strptime(date, "%B %d, %Y")
print time.strftime("%A, %d. %B %Y", dt)
elif i == 1 + col_offset:
a = td.find('a')
if a is not None:
title = tag_to_string(a).strip()
url = a['href']
desc = ''
print "Title: " + title + " -> " + "URL: " + url
# Entry point: pull down the 2010 index page and process each table, once
# for the left (offset 0) and once for the right (offset 2) date/title
# column pair.
page = urllib2.urlopen("http://www.gwynnedyer.com/articles2010.htm")
soup = BeautifulSoup(page)
tables = soup.findAll('table')
for table in tables:
    for offset in (0, 2):
        process_table(table, offset)
# rows = table.findAll('tr')
# for tr in rows:
# cols = tr.findAll('td')
# for i,td in enumerate(cols):
# if i == 0: # Date of first article
# date = td.find('font')
# if date != None:
# datestring = date.string.replace(' ', '')
# print datestring
# elif i == 1:
# a = td.find('a')
# if a is not None:
# title = tag_to_string(a).strip()
# url = a['href']
# desc = ''
# print "Title: " + title + " -> " + "URL: " + url