Read 蘋果日報 on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

蘋果日報

Language: zh

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  # -*- coding: utf-8 -*-
import re
from calibre.web.feeds.recipes import BasicNewsRecipe

class AppleDaily(BasicNewsRecipe):
    """calibre news recipe for Apple Daily, read through a mirror site.

    Sections and their article lists are scraped from the ``<li>`` entries
    of the index pages under ``http://news.hotpot.hk/fruit``.  Image URLs
    that go through the ``img.php?server=...&path=...`` proxy script are
    rewritten to point directly at ``http://<server>/<path>``.
    """

    title = u'蘋果日報'
    __author__ = u'蘋果日報'
    __publisher__ = u'蘋果日報'
    description = u'蘋果日報'
    masthead_url = 'http://hk.apple.nextmedia.com/template/common/header/2009/images/atnextheader_logo_appledaily.gif'
    language = 'zh_TW'
    encoding = 'UTF-8'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_javascript = True
    remove_tags_before = dict(name=['ul', 'h1'])
    remove_tags_after = dict(name='form')
    remove_tags = [
        dict(attrs={'class': ['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}),
        dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']),
        dict(name=['script', 'noscript', 'style', 'form']),
    ]
    no_stylesheets = True
    extra_css = '''
    	@font-face {font-family: "uming", serif, sans-serif;  src: url(res:///usr/share/fonts/truetype/arphic/uming.ttc); }\n
	    body {margin-right: 8pt; font-family: 'uming', serif;}
        h1 {font-family: 'uming', serif, sans-serif}
            '''

    # Intentionally empty.  The entry that used to live here was written as
    # r'img.php?server=...' with '.' and '?' unescaped, so it could never
    # match the literal "img.php?server=" it was aimed at, and its trailing
    # '.*' under re.DOTALL would have swallowed the rest of the page if it
    # ever had matched.  The rewrite is done on parsed attributes in
    # preprocess_html() instead.
    preprocess_regexps = []

    # Root that the index page and all relative links are resolved against.
    _index_base = 'http://news.hotpot.hk/fruit'

    # Matches URLs routed through the img.php proxy; compiled once for the
    # whole class instead of once per tag.
    _image_proxy_re = re.compile(
        r'img\.php\?server=(?P<server>[^&]+)&path=(?P<path>[^&]+)',
        re.DOTALL | re.IGNORECASE)

    def get_cover_url(self):
        """Return the cover image URL (the masthead logo is reused)."""
        return self.masthead_url

    def _direct_image_url(self, url):
        """Translate a proxied ``img.php`` URL into a direct one.

        Returns ``http://<server>/<path>`` when ``url`` contains the
        ``img.php?server=...&path=...`` pattern, otherwise ``None``.
        """
        match = self._image_proxy_re.search(url)
        if match is None:
            return None
        return 'http://' + match.group('server') + '/' + match.group('path')

    def preprocess_html(self, soup):
        """Rewrite proxied image URLs in ``<img src>`` and ``<a href>``.

        ``soup`` is modified in place and also returned.
        """
        # All <img> tags are handled first, then all <a> tags.
        for tag_name, attr in (('img', 'src'), ('a', 'href')):
            for tag in soup.findAll(tag_name, attrs={attr: True}):
                direct = self._direct_image_url(tag[attr])
                if direct is not None:
                    tag[attr] = direct
        return soup

    def _first_text(self, tag):
        """Return the first non-blank text inside ``tag``, stripped.

        Returns ``None`` when ``tag`` holds no text other than whitespace.
        """
        for text in tag.findAll(text=True):
            stripped = text.strip()
            if stripped:
                return stripped
        return None

    def _parse_section(self, url):
        """Fetch one section page and return its articles.

        Every ``<li>`` on the page that contains a link becomes one article
        dict with the keys ``title``, ``url``, ``date``, ``description`` and
        ``content``.
        """
        found = []
        for item in self.index_to_soup(url).findAll('li'):
            link = item.find('a', href=True)
            if link is None:
                # An <li> without a link has no article URL to offer.
                continue
            article_url = self._index_base + '/' + link['href']
            self.log('subUrl' + article_url)
            found.append(dict(
                # A link with no text (e.g. image only) still needs a title.
                title=self._first_text(item) or u'Untitled article',
                url=article_url,
                date='',
                description='',
                content=''))
        return found

    def parse_index(self):
        """Build the list of feeds from the mirror's index page.

        Each ``<li>`` on the index page names a section; the link inside it
        leads to a page whose ``<li>`` entries are that section's articles.

        Returns:
            A list of ``(section title, articles)`` tuples in the order the
            sections first appear on the index page.  A section whose
            ``<li>`` has no link is kept with an empty article list.
        """
        soup = self.index_to_soup(self._index_base + '/index.php')

        sections = []   # section titles, in first-seen order
        articles = {}   # section title -> list of article dicts

        for item in soup.findAll('li'):
            section = self._first_text(item)
            if section is None:
                # Nothing to name the section with; skip instead of crashing.
                continue

            # Register each section once so that a repeated name extends the
            # existing feed instead of wiping it and duplicating the entry.
            if section not in articles:
                articles[section] = []
                sections.append(section)

            link = item.find('a', href=True)
            if link is None:
                continue

            url = self._index_base + '/' + link['href']
            # self.log rather than print: the print statement is Python 2
            # only, and printing non-ASCII section names can raise
            # UnicodeEncodeError depending on the console encoding.
            self.log('section=' + section)
            self.log('url=' + url)

            articles[section].extend(self._parse_section(url))

        return [(section, articles[section]) for section in sections]