Read the Calgary Herald on your iPad or Kindle in no time. Click download to load the free ebook onto your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

News from Calgary, AB

Language: en

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  #!/usr/bin/env  python
# -*- coding: utf-8 -*-

__license__   = 'GPL v3'

'''
www.canada.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup


class CanWestPaper(BasicNewsRecipe):
    '''
    calibre news recipe shared by the CanWest family of newspaper sites.

    One paper is selected through the four class attributes ``title``,
    ``url_prefix``, ``description`` and ``fp_tag``; exactly one group below
    should be un-commented.  This copy is configured for the Calgary Herald.
    Articles are collected from ``<url_prefix>/news/todays-paper/index.html``.
    '''

    # un-comment the following four lines for the Victoria Times Colonist
##    title = u'Victoria Times Colonist'
##    url_prefix = 'http://www.timescolonist.com'
##    description = u'News from Victoria, BC'
##    fp_tag = 'CAN_TC'

    # un-comment the following four lines for the Vancouver Province
##    title = u'Vancouver Province'
##    url_prefix = 'http://www.theprovince.com'
##    description = u'News from Vancouver, BC'
##    fp_tag = 'CAN_VP'

    # un-comment the following four lines for the Vancouver Sun
##    title = u'Vancouver Sun'
##    url_prefix = 'http://www.vancouversun.com'
##    description = u'News from Vancouver, BC'
##    fp_tag = 'CAN_VS'

    # un-comment the following four lines for the Edmonton Journal
##    title = u'Edmonton Journal'
##    url_prefix = 'http://www.edmontonjournal.com'
##    description = u'News from Edmonton, AB'
##    fp_tag = 'CAN_EJ'

    # un-comment the following four lines for the Calgary Herald
    title = u'Calgary Herald'
    url_prefix = 'http://www.calgaryherald.com'
    description = u'News from Calgary, AB'
    fp_tag = 'CAN_CH'

    # un-comment the following four lines for the Regina Leader-Post
##    title = u'Regina Leader-Post'
##    url_prefix = 'http://www.leaderpost.com'
##    description = u'News from Regina, SK'
##    fp_tag = ''

    # un-comment the following four lines for the Saskatoon Star-Phoenix
##    title = u'Saskatoon Star-Phoenix'
##    url_prefix = 'http://www.thestarphoenix.com'
##    description = u'News from Saskatoon, SK'
##    fp_tag = ''

    # un-comment the following four lines for the Windsor Star
##    title = u'Windsor Star'
##    url_prefix = 'http://www.windsorstar.com'
##    description = u'News from Windsor, ON'
##    fp_tag = 'CAN_'

    # un-comment the following four lines for the Ottawa Citizen
##    title = u'Ottawa Citizen'
##    url_prefix = 'http://www.ottawacitizen.com'
##    description = u'News from Ottawa, ON'
##    fp_tag = 'CAN_OC'

    # un-comment the following four lines for the Montreal Gazette
##    title = u'Montreal Gazette'
##    url_prefix = 'http://www.montrealgazette.com'
##    description = u'News from Montreal, QC'
##    fp_tag = 'CAN_MG'


    language = 'en_CA'
    __author__ = 'Nick Redding'
    no_stylesheets = True
    timefmt = ' [%b %d]'
    extra_css = '''
                .timestamp {  font-size:xx-small; display: block; }
                #storyheader { font-size: medium; }
                #storyheader h1 { font-size: x-large; }
                #storyheader h2 { font-size: large;  font-style: italic; }
                .byline { font-size:xx-small; }
                #photocaption { font-size: small; font-style: italic }
                #photocredit { font-size: xx-small; }'''
    # Only the story header and the story body are kept from each article page.
    keep_only_tags = [
        dict(name='div', attrs={'id':'storyheader'}),
        dict(name='div', attrs={'id':'storycontent'}),
    ]
    # Page furniture (comments, navigation, share/print tools, ...) to drop.
    remove_tags = [
        {'class':'comments'},
        dict(name='div', attrs={'class':'navbar'}),
        dict(name='div', attrs={'class':'morelinks'}),
        dict(name='div', attrs={'class':'viewmore'}),
        dict(name='li', attrs={'class':'email'}),
        dict(name='div', attrs={'class':'story_tool_hr'}),
        dict(name='div', attrs={'class':'clear'}),
        dict(name='div', attrs={'class':'story_tool'}),
        dict(name='div', attrs={'class':'copyright'}),
        dict(name='div', attrs={'class':'rule_grey_solid'}),
        dict(name='li', attrs={'class':'print'}),
        dict(name='li', attrs={'class':'share'}),
        dict(name='ul', attrs={'class':'bullet'}),
    ]

    def get_cover_url(self):
        '''
        Return the URL of the most recent front-page image that can be opened.

        The URL is built from ``fp_tag`` and a day-of-month number.  Today's
        image is tried first, then each of the previous six days; the first
        URL that opens without raising is returned.

        Returns None when ``fp_tag`` is empty or none of the seven URLs
        could be opened (in which case "Cover unavailable" is logged).
        '''
        from datetime import timedelta, date
        if not self.fp_tag:
            return None
        # Call through the instance: invoking BasicNewsRecipe.get_browser()
        # with no instance fails wherever get_browser is an instance method.
        br = self.get_browser()
        today = date.today()
        for daysback in range(7):
            day = (today - timedelta(days=daysback)).day
            cover = ('http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'
                     + str(day) + '/lg/' + self.fp_tag + '.jpg')
            try:
                br.open(cover)
            except Exception:
                # Image for that day is not available; try the day before.
                continue
            return cover
        self.log("\nCover unavailable")
        return None

    def fixChars(self,string):
        '''
        Return ``string`` with the code points 0x91-0x97 and the entity
        ``&#x2019;`` replaced by the matching typographic punctuation.

        The substitutions are applied in the order listed below.
        '''
        replacements = (
            ("\x91", "‘"),      # lsquo
            ("\x92", "’"),      # rsquo
            ("\x93", "“"),      # ldquo
            ("\x94", "”"),      # rdquo
            ("\x96", "–"),      # ndash
            ("\x97", "—"),      # mdash
            ("&#x2019;", "’"),  # numeric entity for rsquo
        )
        fixed = string
        for pattern, replacement in replacements:
            fixed = re.sub(pattern, replacement, fixed)
        return fixed

    def massageNCXText(self, description):
        '''
        Clean up a description for use in the table of contents.

        Entities are converted via BeautifulStoneSoup, ``&amp;`` is unescaped
        to ``&`` and the result is passed through ``fixChars``.  A falsy
        ``description`` (None or empty) is returned unchanged.
        '''
        # Kindle TOC descriptions won't render certain characters
        if not description:
            return description
        # ``unicode`` on Python 2, ``str`` on Python 3.
        text_type = type(u'')
        massaged = text_type(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
        # Replace '&amp;' with '&'
        massaged = re.sub("&amp;", "&", massaged)
        return self.fixChars(massaged)

    def populate_article_metadata(self, article, soup, first):
        '''
        Fill in the TOC thumbnail and a fallback summary for ``article``.

        When ``first`` is true and the page body contains an image, that image
        (with any ``links\\linkN\\`` prefix removed from its ``src``) is
        registered as the article's TOC thumbnail.  When the article has no
        summary text, the page's ``og:description`` meta tag, if present, is
        used as the summary.
        '''
        if first:
            body = soup.find('body')
            # Guard against pages without a <body> instead of raising.
            picdiv = body.find('img') if body is not None else None
            if picdiv is not None:
                self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
        if not article.text_summary.strip():
            desc = soup.find('meta',attrs={'property':'og:description'})
            if desc is not None:
                article.summary = article.text_summary = desc['content']

    def strip_anchors(self,soup):
        '''
        Replace every ``<a>`` that contains no image with its rendered
        contents, so link text stays but the link itself is removed.

        Anchors wrapping an ``<img>`` are left untouched.  Returns ``soup``.
        '''
        # One pass over all anchors; searching under every tag in turn would
        # revisit the same anchors once per ancestor.
        for a in soup.findAll('a'):
            if a.img is None:
                # NOTE(review): contents are decoded as cp1252 -- confirm this
                # matches the encoding renderContents() actually emits.
                a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup

    def preprocess_html(self, soup):
        '''Strip non-image anchors from the page; see ``strip_anchors``.'''
        return self.strip_anchors(soup)

    def parse_index(self):
        '''
        Build the list of sections and articles from the "today's paper" page.

        Returns a list of ``(section_title, articles)`` tuples in the order
        the sections first appear, where each article is a dict with the keys
        ``title``, ``url``, ``date``, ``description``, ``author`` and
        ``content``.  Sections without any article are omitted.  Articles
        seen before the first section heading are filed under 'News'.
        '''
        soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')

        articles = {}
        key = 'News'
        ans = ['News']

        # Find each instance of class="section_title02", class="featurecontent"
        for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
            if divtag['class'].startswith('section_title'):
                # div contains section title
                if not divtag.h3:
                    continue
                key = self.tag_to_string(divtag.h3,False)
                # A repeated heading must not yield the same section twice.
                if key not in ans:
                    ans.append(key)
                self.log("Section name %s" % key)
                continue
            # div contains article data
            h1tag = divtag.find('h1')
            if not h1tag:
                continue
            atag = h1tag.find('a',href=True)
            if not atag:
                continue
            url = self.url_prefix+'/news/todays-paper/'+atag['href']
            title = self.tag_to_string(atag,False)
            pubdate = ''
            ptag = divtag.find('p')
            description = self.tag_to_string(ptag,False) if ptag else ''
            autag = divtag.find('h4')
            author = self.tag_to_string(autag,False) if autag else ''
            articles.setdefault(key, []).append(
                dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))

        return [(section, articles[section]) for section in ans if section in articles]