Read 联合早报网 zaobao.com on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes, or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the calibre documentation and forums applies here as well.
News from zaobao.com
Language: zh
Requires Subscription: No, it's available as a free ebook
Schedule: Every morning
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Pu Bo <pubo at pubolab.com>'
'''
zaobao.com
'''
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.web.feeds import feeds_from_index
class ZAOBAO(BasicNewsRecipe):
    """Recipe that collects news from zaobao.com.

    Articles come from two sources: the ``feeds`` list, which is handed to
    ``BasicNewsRecipe.parse_feeds``, and the HTML pages in ``INDEXES``, whose
    ``<li><a href=...>`` entries are scraped in ``parse_feeds``. After both are
    collected, feed and article texts that look mis-decoded are repaired using
    ``encoding``.
    """

    title = u'\u8054\u5408\u65e9\u62a5\u7f51 zaobao.com'
    __author__ = 'Pu Bo'
    description = 'News from zaobao.com'
    no_stylesheets = True
    recursions = 1
    language = 'zh'
    # Also used by parse_feeds() to repair texts that were decoded wrongly.
    encoding = 'gbk'
    masthead_url = 'http://www.zaobao.com/ssi/images1/zblogo_original.gif'
    # multithreaded_fetch = True

    keep_only_tags = [
        dict(name='td', attrs={'class': 'text'}),
        dict(name='span', attrs={'class': 'page'}),
        dict(name='div', attrs={'id': 'content'}),
    ]
    remove_tags = [
        dict(name='table', attrs={'cellspacing': '9'}),
        dict(name='fieldset'),
        dict(name='div', attrs={'width': '30%'}),
    ]

    extra_css = (
        '\n'
        '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}\n'
        'body{font-family: serif1, serif}\n'
        '.article_description{font-family: serif1, serif}\n'
        'p{font-family: serif1, serif}\n'
        'h1 {font-weight: bold; font-size: large;}\n'
        'h2 {font-size: large;}\n'
        '.title {font-size: large;}\n'
        '.article {font-size:medium}\n'
        '.navbar {font-size: small}\n'
        '.feed{font-size: medium}\n'
        '.small{font-size: small;padding-right: 8pt}\n'
        '.text{padding-right: 8pt}\n'
        'p{text-indent: 0cm}\n'
        'div#content{padding-right: 10pt}'
    )

    # (title, url) pairs of HTML index pages scraped by parse_feeds().
    INDEXES = [
        (u'\u65b0\u95fb\u56fe\u7247', u'http://www.zaobao.com/photoweb/photoweb_idx.shtml'),
    ]
    # Only the first MAX_ITEMS_IN_INDEX <li> elements of an index page are examined.
    MAX_ITEMS_IN_INDEX = 10
    # Marker text: assigned as the description of index feeds, and a feed
    # description that does not contain it is treated as mis-decoded.
    DESC_SENSE = u'\u8054\u5408\u65e9\u62a5\u7f51'

    # (title, url) pairs processed by BasicNewsRecipe.parse_feeds().
    feeds = [
        (u'\u5373\u65f6\u62a5\u9053', u'http://realtime.zaobao.com/news.xml'),
        (u'\u4e2d\u56fd\u65b0\u95fb', u'http://www.zaobao.com/zg/zg.xml'),
        (u'\u56fd\u9645\u65b0\u95fb', u'http://www.zaobao.com/gj/gj.xml'),
        (u'\u4e16\u754c\u62a5\u520a\u6587\u8403', u'http://www.zaobao.com/wencui/wencui.xml'),
        (u'\u4e1c\u5357\u4e9a\u65b0\u95fb', u'http://www.zaobao.com/yx/yx.xml'),
        (u'\u65b0\u52a0\u5761\u65b0\u95fb', u'http://www.zaobao.com/sp/sp.xml'),
        (u'\u4eca\u65e5\u89c2\u70b9', u'http://www.zaobao.com/yl/yl.xml'),
        (u'\u4e2d\u56fd\u8d22\u7ecf', u'http://www.zaobao.com/cz/cz.xml'),
        (u'\u72ee\u57ce\u8d22\u7ecf', u'http://www.zaobao.com/cs/cs.xml'),
        (u'\u5168\u7403\u8d22\u7ecf', u'http://www.zaobao.com/cg/cg.xml'),
        (u'\u65e9\u62a5\u4f53\u80b2', u'http://www.zaobao.com/ty/ty.xml'),
        (u'\u65e9\u62a5\u526f\u520a', u'http://www.zaobao.com/fk/fk.xml'),
    ]

    # Text (non-bytes) string type: ``unicode`` on Python 2, ``str`` on
    # Python 3. Avoids referencing the ``unicode`` builtin, which does not
    # exist on Python 3.
    _TEXT_TYPE = type(u'')

    def preprocess_html(self, soup):
        """Remove the ``href`` of links that point outside zaobao.com.

        A link is stripped when its ``href`` contains ``http://`` but not
        ``zaobao.com``; all other links are left untouched.

        :param soup: parsed document to modify in place.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll(name='a'):
            # Tag.get() replaces the deprecated Tag.has_key().
            tag_url = tag.get('href')
            if tag_url is None:
                continue
            if 'http://' in tag_url and 'zaobao.com' not in tag_url:
                del tag['href']
        return soup

    def postprocess_html(self, soup, first):
        """Turn every ``table``, ``tr`` and ``td`` element into a ``div``.

        :param soup: parsed document to modify in place.
        :param first: unused here; part of the hook signature.
        :return: the same ``soup`` object.
        """
        for tag in soup.findAll(name=['table', 'tr', 'td']):
            tag.name = 'div'
        return soup

    def parse_feeds(self):
        """Return the base-class feeds plus the feeds scraped from ``INDEXES``.

        Every feed, and each of its articles, is then passed through the
        encoding repair, and articles titled "Untitled article" are dropped.

        :return: list of feed objects.
        """
        self.log(_('ZAOBAO overrided parse_feeds()'))
        parsed_feeds = BasicNewsRecipe.parse_feeds(self)
        for title, url in self.INDEXES:
            parsed_feeds.extend(self._scrape_index(title, url))
        for feed in parsed_feeds:
            self._repair_feed(feed)
        return parsed_feeds

    def _scrape_index(self, title, url):
        """Build feeds from the linked ``<li>`` entries of one index page."""
        articles = []
        soup = self.index_to_soup(url)
        for i, item in enumerate(soup.findAll('li')):
            # The cap counts <li> elements examined, not articles added.
            if i >= self.MAX_ITEMS_IN_INDEX:
                break
            link = item.find('a')
            a_url = link.get('href') if link is not None else None
            if a_url is None:
                continue
            a_title = self.tag_to_string(link)
            self.log(_('adding %s at %s')%(a_title,a_url))
            articles.append({
                'title': a_title,
                'date': '',
                'url': a_url,
                'description': '',
            })
        pfeeds = feeds_from_index(
            [(title, articles)],
            oldest_article=self.oldest_article,
            max_articles_per_feed=self.max_articles_per_feed)
        self.log(_('adding %s to feed')%(title))
        for feed in pfeeds:
            self.log(_('adding feed: %s')%(feed.title))
            feed.description = self.DESC_SENSE
            for article in feed:
                self.log(_('added article %s from %s')%(article.title, article.url))
            self.log(_('added feed %s')%(feed.title))
        return pfeeds

    def _recode(self, text):
        """Re-encode ``text`` as cp1252 and decode it again with ``self.encoding``."""
        return text.encode('cp1252', 'replace').decode(self.encoding, 'replace')

    def _repair_feed(self, feed):
        """Repair the encoding of one feed and drop its untitled articles."""
        # Workaround for a strange problem: sometimes the XML encoding is not
        # applied correctly by parse().
        weird_encoding_detected = False
        # Truthiness is checked first so a missing (None) description is
        # skipped instead of raising on ``.find``.
        if feed.description and self.encoding:
            if not isinstance(feed.description, self._TEXT_TYPE):
                self.log(_('Feed %s is not encoded correctly, manually replace it')%(feed.title))
                feed.description = feed.description.decode(self.encoding, 'replace')
            elif feed.description.find(self.DESC_SENSE) == -1:
                self.log(_('Feed %s is weired encoded, manually redo all')%(feed.title))
                feed.description = self._recode(feed.description)
                weird_encoding_detected = True

        # Collect the survivors and assign them afterwards: deleting from
        # feed.articles while iterating over it would skip the article that
        # follows each removed one.
        kept = []
        for article in list(feed):
            self._repair_article(article, weird_encoding_detected)
            if article.title == "Untitled article":
                self.log(_('Removing empty article %s from %s')%(article.title, article.url))
                continue
            kept.append(article)
        feed.articles[:] = kept

    def _repair_article(self, article, redo_all):
        """Decode byte-string fields of ``article``; re-decode all if ``redo_all``."""
        text_type = self._TEXT_TYPE
        if self.encoding:
            if article.title is not None and not isinstance(article.title, text_type):
                article.title = article.title.decode(self.encoding, 'replace')
            if article.summary and not isinstance(article.summary, text_type):
                article.summary = article.summary.decode(self.encoding, 'replace')
                article.text_summary = article.summary
            if article.text_summary and not isinstance(article.text_summary, text_type):
                article.text_summary = article.text_summary.decode(self.encoding, 'replace')
                article.summary = article.text_summary
        if redo_all:
            if article.title:
                article.title = self._recode(article.title)
            if article.summary:
                article.summary = self._recode(article.summary)
            if article.text_summary:
                article.text_summary = self._recode(article.text_summary)

    def get_browser(self, *args, **kwargs):
        """Return the base-class browser with a ``Pragma: no-cache`` header added.

        Extra arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser``.
        """
        # ``self`` must be passed explicitly when calling through the class.
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders.append(('Pragma', 'no-cache'))
        return br