Read The Age on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Business News, World News and Breaking News in Melbourne, Australia

Language: en

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  #!/usr/bin/env  python
__license__   = 'GPL v3'
__copyright__ = '2009, Matthew Briggs <hal.sulphur@gmail.com>'
__docformat__ = 'restructuredtext en'

'''
theage.com.au
'''
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
import re

class TheAge(BasicNewsRecipe):
    """Recipe that builds an ebook from the text-only edition of theage.com.au.

    The article index is scraped from http://www.theage.com.au/text/, where
    every <h3> starts a section and the <a> tags that follow it are that
    section's articles.
    """

    title            = 'The Age'
    description      = 'Business News, World News and Breaking News in Melbourne, Australia'
    publication_type = 'newspaper'
    __author__       = 'Matthew Briggs'
    language         = 'en_AU'

    max_articles_per_feed = 1000
    recursions        = 0
    # Drop tables/scripts/styles and the navigation links that point back to
    # the site root ('/') and to the text-mode index ('/text/').
    remove_tags       = [dict(name=['table', 'script', 'noscript', 'style']), dict(name='a', attrs={'href':'/'}), dict(name='a', attrs={'href':'/text/'})]

    def get_browser(self, *args, **kwargs):
        """Return the base-class browser with refresh handling disabled.

        Extra positional/keyword arguments are forwarded unchanged to
        ``BasicNewsRecipe.get_browser``.
        """
        # get_browser is an instance method in the current calibre API, so
        # ``self`` must be passed explicitly when calling through the class.
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.set_handle_refresh(False)
        return br

    def parse_index(self):
        """Scrape the text-mode front page into a list of feeds.

        Returns a list of ``(section_title, articles)`` tuples, where each
        article is a dict with the keys ``title``, ``url``, ``date``,
        ``description`` and ``content``. Well-known sections come first in a
        fixed order; any other sections found on the page follow.
        """
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/text/').read())

        section = None
        sections = {}

        for tag in soup.findAll(['h3', 'a']):
            if tag.name == 'h3':
                # A heading opens a new section; later links belong to it.
                section = self.tag_to_string(tag)
                sections[section] = []
                continue

            # Links seen before the first heading are not articles.
            if not section:
                continue

            # ``get`` works on anchors without an href (e.g. named anchors).
            # Hrefs of length <= 1 are skipped: <a href="/">TheAge</a>
            url = (tag.get('href') or '').strip()
            if len(url) <= 1:
                continue

            if url.startswith('/'):
                url = 'http://www.theage.com.au' + url

            sections[section].append({
                'title': self.tag_to_string(tag),
                'url': url,
                'date': strftime('%a, %d %b'),
                'description': '',
                'content': '',
            })

        # Insert feeds in the specified order, if available. Sections missing
        # from the page are simply skipped instead of raising KeyError.
        preferred = ['National', 'World', 'Opinion', 'Columns', 'Business', 'Sport', 'Entertainment']
        feeds = [(name, sections[name]) for name in preferred if name in sections]

        # Append what is left over...
        for name in sections:
            if name not in preferred:
                feeds.append((name, sections[name]))

        return feeds

    def get_cover_url(self):
        """Return the URL of today's front-page PDF, or None if not found."""
        soup = BeautifulSoup(self.browser.open('http://www.theage.com.au/todays-paper').read())

        cover_re = re.compile(
            r'http://www\.theage\.com\.au/frontpage/[0-9]+/[0-9]+/[0-9]+/frontpage\.pdf')

        for anchor in soup.findAll('a'):
            # Not every anchor carries an href; ``get`` avoids a KeyError.
            href = anchor.get('href')
            if href and cover_re.match(href):
                return href

        return None

    def preprocess_html(self, soup):
        """Clean up article paragraphs before conversion.

        Removes paragraphs that contain only the remains of the text-mode
        navigation bar (brackets, pipes and whitespace) and shrinks the font
        of the copyright notice. Returns the modified soup.
        """
        # ``unicode`` on Python 2, ``str`` on Python 3.
        text_type = type(u'')

        # Matches what is left of the text-mode navigation, e.g. "[ | | ]"
        # optionally surrounded by whitespace / &nbsp; entities.
        nav_re = re.compile(r'((\s)|(\&nbsp\;))*\[[\|\s*]*\]((\s)|(\&nbsp\;))*$')

        for p in soup.findAll('p'):

            # Collapse the paragraph by joining the non-tag contents
            pieces = [node for node in p.contents if isinstance(node, text_type)]
            if not pieces:
                continue
            text = ''.join(pieces)

            # Filter out what's left of the text-mode navigation stuff
            if nav_re.match(text):
                p.extract()
                continue

            # Shrink the fine print font
            if text == 'This material is subject to copyright and any unauthorised use, copying or mirroring is prohibited.':
                p['style'] = 'font-size:small'

        return soup