Read 蘋果日報 on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
蘋果日報
Language: zh
Requires Subscription: No, it's available as free ebook
Schedule Every morning
# -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class AppleDaily(BasicNewsRecipe):
title = u'蘋果日報'
__author__ = u'蘋果日報'
__publisher__ = u'蘋果日報'
description = u'蘋果日報'
masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
language = 'zh_TW'
encoding = 'UTF-8'
timefmt = ' [%a, %d %b, %Y]'
needs_subscription = False
remove_javascript = True
remove_tags_before = dict(name=['ul', 'h1'])
remove_tags_after = dict(name='form')
remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
dict(name=['script', 'noscript', 'style', 'form'])]
no_stylesheets = True
extra_css = '''
@font-face {font-family: "uming", serif, sans-serif; src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
body {margin-right: 8pt; font-family: 'uming', serif;}
h1 {font-family: 'uming', serif, sans-serif}
'''
#extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
preprocess_regexps = [
(re.compile(r'img.php?server=(?P<server>[^&]+)&path=(?P<path>[^&]+).*', re.DOTALL|re.IGNORECASE),
lambda match: 'http://' + match.group('server') + '/' + match.group('path')),
]
def get_cover_url(self):
return 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
#def get_browser(self):
#br = BasicNewsRecipe.get_browser()
#if self.username is not None and self.password is not None:
# br.open('http://www.nytimes.com/auth/login')
# br.select_form(name='login')
# br['USERID'] = self.username
# br['PASSWORD'] = self.password
# br.submit()
#return br
def preprocess_html(self, soup):
#process all the images
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
#print 'checking image: ' + iurl
#img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
m = p.search(iurl)
if m is not None:
iurl = 'http://' + m.group('server') + '/' + m.group('path')
#print 'working! new url: ' + iurl
tag['src'] = iurl
#else:
#print 'not good'
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
iurl = tag['href']
#print 'checking image: ' + iurl
#img\.php?server\=(?P<server>[^&]+)&path=(?P<path>[^&]+)
p = re.compile(r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)', re.DOTALL|re.IGNORECASE)
m = p.search(iurl)
if m is not None:
iurl = 'http://' + m.group('server') + '/' + m.group('path')
#print 'working! new url: ' + iurl
tag['href'] = iurl
#else:
#print 'not good'
return soup
def parse_index(self):
base = 'http://news.hotpot.hk/fruit'
soup = self.index_to_soup('http://news.hotpot.hk/fruit/index.php')
#def feed_title(div):
# return ''.join(div.findAll(text=True, recursive=False)).strip()
articles = {}
key = None
ans = []
for div in soup.findAll('li'):
key = div.find(text=True, recursive=True);
#if key == u'豪情':
# continue;
print 'section=' + key
articles[key] = []
ans.append(key)
a = div.find('a', href=True)
if not a:
continue
url = base + '/' + a['href']
print 'url=' + url
if not articles.has_key(key):
articles[key] = []
else:
# sub page
subSoup = self.index_to_soup(url)
for subDiv in subSoup.findAll('li'):
subA = subDiv.find('a', href=True)
subTitle = subDiv.find(text=True, recursive=True)
subUrl = base + '/' + subA['href']
print 'subUrl' + subUrl
articles[key].append(
dict(title=subTitle,
url=subUrl,
date='',
description='',
content=''))
# elif div['class'] in ['story', 'story headline']:
# a = div.find('a', href=True)
# if not a:
# continue
# url = re.sub(r'\?.*', '', a['href'])
# url += '?pagewanted=all'
# title = self.tag_to_string(a, use_alt=True).strip()
# description = ''
# pubdate = strftime('%a, %d %b')
# summary = div.find(True, attrs={'class':'summary'})
# if summary:
# description = self.tag_to_string(summary, use_alt=False)
#
# feed = key if key is not None else 'Uncategorized'
# if not articles.has_key(feed):
# articles[feed] = []
# if not 'podcasts' in url:
# articles[feed].append(
# dict(title=title, url=url, date=pubdate,
# description=description,
# content=''))
# ans = self.sort_index_by(ans, {'The Front Page':-1, 'Dining In, Dining Out':1, 'Obituaries':2})
ans = [(unicode(key), articles[key]) for key in ans if articles.has_key(key)]
return ans