Read The Independent on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Independent News - Breaking news, comment and features from The Independent newspaper

Language: en

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  # adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString


class TheIndependentNew(BasicNewsRecipe):
    """calibre news recipe for The Independent (independent.co.uk).

    The class-level attributes below configure which parts of each article
    page are kept, how the page is pre-processed and how the result is styled.
    The RSS feeds to download are listed in ``feeds`` further down the class.
    """

    # flag to enable/disable article graphics on business pages/some others
    # eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
    # -max dimensions can be altered using the .pictureContainer img selector in the css
    _FETCH_ARTICLE_GRAPHICS = True

    # Flag to enable/disable image fetching (not business)
    _FETCH_IMAGES = True


    # used for converting rating to stars (see _insertRatingStars)
    _STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
    _NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'


    title                   = u'The Independent'
    __author__              = 'Will'
    description             = 'The latest in UK News and World News from The \
                               Independent. Wide range of international and local news, sports \
                               news, commentary and opinion pieces.Independent News - Breaking news \
                               that matters. Your daily comprehensive news source - The \
                               Independent Newspaper'
    publisher               = 'The Independent'
    category                = 'news, UK'
    no_stylesheets          = True
    use_embedded_content    = False
    remove_empty_feeds      = True
    language                = 'en_GB'
    publication_type        = 'newspaper'
    masthead_url            = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
    encoding                = 'utf-8'
    # Elements dropped from every article page.
    # NOTE(review): the last entry matches every tag that carries a ``style``
    # attribute (the regex matches any value) - confirm this is intended.
    remove_tags             =[
                               dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
                               dict(attrs={'class' : ['autoplay','openBiogPopup']}),
                               dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
                               dict(attrs={'style' : re.compile('.*')}),
                             ]

    # only the element with id="main" is kept
    keep_only_tags          =[dict(attrs={'id':'main'})]
    recursions = 0

    # fixes non compliant html nesting and 'marks' article graphics links:
    #  1. <span class="storyTop "> wrappers are rewritten as <div class="storyTop">
    #  2. <strong> blocks containing a "click ... here" link are wrapped in
    #     <div class="article-graphic"> so preprocess_html can find them later
    preprocess_regexps      = [
                                (re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
                                lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
                                (re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
                                lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
                              ]


    # metadata for the generated book, reusing the attributes defined above
    conversion_options = {
                          'comment'   : description
                        , 'tags'      : category
                        , 'publisher' : publisher
                        , 'language'  : language
                        }

    # stylesheet applied to the articles; the .image, .starRating and
    # .pictureContainer classes are the ones assigned by the methods below
    extra_css             = """
                               h1{font-family: Georgia,serif }
                               body{font-family: Verdana,Arial,Helvetica,sans-serif}
                               img{margin-bottom: 0.4em; display:block}
                               .starRating img {float: left}
                               .starRating {margin-top:0.4em; display: block}
                               .image {clear:left; font-size: x-small; color:#888888;}
                               .articleByTimeLocation {font-size: x-small; color:#888888;
                                margin-bottom:0.2em ; margin-top:0.2em ; display:block}
                                .subtitle {clear:left}
                               .column-1 h1 { color: #191919}
                               .column-1 h2 { color: #333333}
                               .column-1 h3 { color: #444444}
                               .column-1 p { color: #777777}
                               .column-1 p,a,h1,h2,h3 { margin: 0; }
                               .column-1 div{color:#888888; margin: 0;}
                               .articleContent {display: block; clear:left;}
                               .storyTop{}
                               .pictureContainer img { max-width: 400px; max-height: 400px;}
                            """

    oldest_article = 1
    max_articles_per_feed = 100

    # URLs already handed out by get_article_url, used to drop duplicates.
    # NOTE(review): this is a class-level list, so it is shared by every
    # instance of the recipe created in the same process.
    _processed_urls = []


    def get_article_url(self, article):
        """Return the URL to download for ``article``, or None to skip it.

        An article is skipped when its title starts with 'Video:' or when its
        URL has already been returned by an earlier call (the same story is
        often listed in several feeds).

        ``article`` is the feed entry handed over by calibre; it is only
        accessed through ``article.get('title', None)`` here.
        """
        # Call the base class explicitly: super(self.__class__, self) recurses
        # forever as soon as this recipe is subclassed.
        url = BasicNewsRecipe.get_article_url(self, article)

        title = article.get('title', None)
        if title and title.startswith('Video:'):
            return None

        # remove duplicates
        if url in self._processed_urls:
            return None
        self._processed_urls.append(url)
        return url

    def populate_article_metadata(self, article, soup, first):
        """Use the first image of the article as its table-of-contents thumbnail.

        Nothing happens unless ``first`` is true and the running calibre
        version provides ``add_toc_thumbnail``. An ``<img>`` without a ``src``
        attribute is ignored instead of raising ``KeyError``.
        """
        if not (first and hasattr(self, 'add_toc_thumbnail')):
            return
        picture = soup.find('img')
        if picture is None:
            return
        src = picture.get('src')
        if src:
            self.add_toc_thumbnail(article, src)

    def preprocess_html(self, soup):
        """Clean up a downloaded article page before it is converted.

        Steps, in order:
          1. drop pages marked 'ADVERTORIAL FEATURE' (returns None for them)
          2. remove unwanted ``widget`` blocks, keeping article content, titles
             and (when ``_FETCH_IMAGES`` is set) slideshows
          3. turn slideshow links into full-size images with captions
          4. remove an empty subtitle
          5. replace numeric ratings with star images
          6. remove empty paragraphs and stray line breaks in ``storyTop``
          7. replace 'click here' graphic links with the graphics themselves
             (when ``_FETCH_ARTICLE_GRAPHICS`` is set)

        Returns the modified ``soup``.
        """

        #remove 'advertorial articles'
        strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
        if strapline:
            for para in strapline.findAll('p'):
                if len(para.contents) and isinstance(para.contents[0],NavigableString) \
                and para.contents[0] == 'ADVERTORIAL FEATURE':
                    return None

        # compiled once instead of once per widget
        keep_pattern = re.compile('((articleContent)|(title))$')
        unwanted_widgets = []
        slideshow_elements = []

        for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
            css_class = item['class']

            # by default only the article content and title widgets survive
            remove = keep_pattern.search(css_class) is None

            # corrections
            # story content always good
            if 'storyContent' in css_class:
                remove = False

            #images
            if 'slideshow' in css_class:
                if self._FETCH_IMAGES:
                    remove = False
                    slideshow_elements.append(item)
                else:
                    remove = True

            #social widgets always bad
            if 'socialwidget' in css_class:
                remove = True

            if remove:
                unwanted_widgets.append(item)

        for item in unwanted_widgets:
            item.extract()

        if self._FETCH_IMAGES:
            for element in slideshow_elements:
                for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
                    img = item.img
                    if img is None:
                        continue

                    #use full size image
                    img['src'] = item['href']

                    #insert caption if available
                    if img.get('title') and (len(img['title']) > 1):
                        tag = Tag(soup,'h3')
                        text = NavigableString(img['title'])
                        tag.insert(0,text)

                        #picture before text
                        img.extract()
                        item.insert(0,img)
                        item.insert(1,tag)

                    # remove link
                    item.name = "div"
                    item["class"]='image'
                    del item["href"]

        #remove empty subtitles
        # currently the subtitle is located in first paragraph after
        # sibling <h3 class="subtitle"> tag. This may be 'fixed' at
        # some point.
        subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
        if subtitle is not None:
            subtitle_text = subtitle.findNext('p')
            if subtitle_text is not None:
                contents = subtitle_text.contents
                # a paragraph without any children is empty as well; indexing
                # contents[0] on it used to raise IndexError
                if not contents or len(contents[0]) <= 1:
                    subtitle_text.extract()
                    subtitle.extract()

        #replace rating numbers with stars (modifies the tags in place)
        for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
            self._insertRatingStars(soup,item)

        #remove empty paragraph tags in storyTop which can leave a space
        #between first paragraph and rest of story
        story_top = soup.find('div',attrs={ 'class' : ['storyTop']})
        if story_top is not None:
            empty_paragraphs = []
            for para in story_top.findAll('p'):
                contents = para.contents
                # evaluated per paragraph: one paragraph with nested tags must
                # not protect every empty paragraph that follows it
                has_nested_tag = any(isinstance(child, Tag) for child in contents)
                if not has_nested_tag and (not contents or len(contents[0]) <= 1):
                    empty_paragraphs.append(para)

            for para in empty_paragraphs:
                para.extract()

        #remove line breaks immediately next to tags with default margins
        #to prevent double line spacing and narrow columns of text
        #(the helper returns straight away when story_top is None)
        self._remove_undesired_line_breaks_from_tag(story_top,soup)

        #replace article graphics link with the graphics themselves
        if self._FETCH_ARTICLE_GRAPHICS:
            items_to_insert = []
            for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
                strong = item.find('strong')
                if strong is None:
                    continue
                for child in strong:
                    if isinstance(child,Tag) and str(child.name) == 'a':
                        href = child.get('href')
                        # anchors without a target cannot point to a graphic
                        if href:
                            items_to_insert.extend(self._get_article_graphic(strong,href,soup))

            for old_item, new_item in items_to_insert:
                # several graphics can map to the same <strong>; once it has
                # been replaced it is detached and cannot be replaced again
                if old_item.parent is not None:
                    old_item.replaceWith(new_item)

        return soup


    def _get_article_graphic(self,old_item,url,soup):

        items_to_insert = []

        if re.search('\.jpg$',str(url)):
            div = Tag(soup,'div')
            div['class'] = 'pictureContainer'
            img = Tag(soup,'img')
            img['src'] = url
            img['alt'] = 'article graphic'
            div.insert(0,img)
            items_to_insert.append((old_item,div,))
            return items_to_insert

        soup2 = self.index_to_soup(url)
        for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
            items_to_insert.append((old_item,item),)
        return items_to_insert


    def _insertRatingStars(self,soup,item):
        if item.contents is None or len(item.contents) < 1:
            return
        rating = item.contents[0]

        try:
            rating = float(item.contents[0])
        except:
            print 'Could not convert decimal rating to star: malformatted float.'
            return
        for i in range(1,6):
            star = Tag(soup,'img')
            if i <= rating:
                star['src'] = self._STAR_URL
            else:
                star['src'] = self._NO_STAR_URL
            star['alt'] = 'star number ' +  str(i)
            item.insert(i,star)
        #item.contents[0] = NavigableString('(' + str(rating) + ')')
        item.contents[0] = ''

    def postprocess_html(self,soup, first_fetch):
        """Remove images that failed to download, together with their captions.

        For every ``<div class="image">`` the next ``<img>`` is inspected; if
        its ``src`` still starts with the remote independent.co.uk address the
        image and an ``<h3>`` sibling following it (the caption) are removed.
        Returns ``soup``.
        """
        # broken images still point to remote url
        remote_url = re.compile('http://www.independent.co.uk.*')

        doomed = []
        for container in soup.findAll('div', attrs={'class' : 'image'}):
            picture = container.findNext('img')
            if not (picture and picture.get('src')):
                continue
            if remote_url.match(picture["src"]) is None:
                continue
            caption = picture.findNextSibling('h3')
            if caption is not None:
                doomed.append(caption)
            doomed.append(picture)

        for element in doomed:
            element.extract()
        return soup

    def _recurisvely_linearise_tag_tree(
        self,
        item,
        linearised= None,
        count=0,
        limit = 100
        ):
        linearised = linearised or []
        count = count + 1
        if count > limit:
            return linearised
        if not (isinstance(item,Tag)):
            return linearised
        for nested in item:
            linearised.append(nested)
            linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
        return linearised


    def _get_previous_tag(self,current_index, tag_tree):
        if current_index == 0:
            return None
        else:
            return tag_tree[current_index - 1]


    def _get_next_tag(self,current_index, tag_tree):
        if current_index < len(tag_tree) - 1:
            return tag_tree[current_index + 1]
        else:
            return None


    def _list_match(self,test_str, list_regex):
        for regex in list_regex:
            match = re.match(regex, test_str)
            if match is not None:
                return True
        return False

    def _remove_undesired_line_breaks_from_tag(self,parent,soup):

        if parent is None:
            return


        tag_tree = self._recurisvely_linearise_tag_tree(parent)
        items_to_remove = []


        for item in tag_tree:
            if item == u'\n':
               items_to_remove.append(item)
               continue;

        for item in items_to_remove:
            tag_tree.remove(item)


        spaced_tags = [r'p', r'h\d', r'blockquote']
        tags_to_extract = []
        tags_to_replace = []
        for (i, tag) in enumerate(tag_tree):
            if isinstance(tag, Tag):
                if str(tag) == '<br />':
                    previous_tag = self._get_previous_tag(i, tag_tree)

                    if isinstance(previous_tag, Tag):
                        previous_tag_is_spaced = previous_tag is not None\
                             and self._list_match(str(previous_tag.name),
                                spaced_tags)
                    else:
                        previous_tag_is_spaced = False

                    next_tag = self._get_next_tag(i, tag_tree)

                    if isinstance(next_tag, Tag):
                        next_tag_is_spaced = next_tag is not None\
                             and self._list_match(str(next_tag.name), spaced_tags)
                    else:
                        next_tag_is_spaced = False

                    if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
                         or i == len(tag_tree) - 1:
                        tags_to_extract.append(tag)
                    else:
                        tags_to_replace.append((tag,NavigableString(' '),))


        for pair in tags_to_replace:
            pair[0].replaceWith(pair[1])
        for tag in tags_to_extract:
            tag.extract()

    # (section title, RSS feed URL) pairs downloaded by the recipe.
    # Every URL carries the '?service=rss' query; the 'House & Home' entry
    # used to lack it and so pointed at the HTML section page, not a feed.
    feeds = [
        (u'News - UK',
         u'http://www.independent.co.uk/news/uk/?service=rss'),
        (u'News - World',
         u'http://www.independent.co.uk/news/world/?service=rss'),
        (u'News - Business',
         u'http://www.independent.co.uk/news/business/?service=rss'),
        (u'News - People',
         u'http://www.independent.co.uk/news/people/?service=rss'),
        (u'News - Science',
         u'http://www.independent.co.uk/news/science/?service=rss'),
        (u'News - Media',
         u'http://www.independent.co.uk/news/media/?service=rss'),
        (u'News - Education',
         u'http://www.independent.co.uk/news/education/?service=rss'),
        (u'News - Obituaries',
         u'http://www.independent.co.uk/news/obituaries/?service=rss'),
        (u'News - Corrections',
         u'http://www.independent.co.uk/news/corrections/?service=rss'),
        (u'Opinion',
         u'http://www.independent.co.uk/opinion/?service=rss'),
        (u'Environment',
         u'http://www.independent.co.uk/environment/?service=rss'),
        (u'Sport - Athletics',
         u'http://www.independent.co.uk/sport/general/athletics/?service=rss'),
        (u'Sport - Cricket',
         u'http://www.independent.co.uk/sport/cricket/?service=rss'),
        (u'Sport - Football',
         u'http://www.independent.co.uk/sport/football/?service=rss'),
        (u'Sport - Golf',
         u'http://www.independent.co.uk/sport/golf/?service=rss'),
        (u'Sport - Motor racing',
         u'http://www.independent.co.uk/sport/motor-racing/?service=rss'),
        (u'Sport - Olympics',
         u'http://www.independent.co.uk/sport/olympics/?service=rss'),
        (u'Sport - Racing',
         u'http://www.independent.co.uk/sport/racing/?service=rss'),
        (u'Sport - Rugby League',
         u'http://www.independent.co.uk/sport/general/rugby-league/?service=rss'),
        (u'Sport - Rugby Union',
         u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'),
        (u'Sport - Sailing',
         u'http://www.independent.co.uk/sport/general/sailing/?service=rss'),
        (u'Sport - Tennis',
         u'http://www.independent.co.uk/sport/tennis/?service=rss'),
        (u'Sport - Others',
         u'http://www.independent.co.uk/sport/general/others/?service=rss'),
        (u'Life & Style - Fashion',
         u'http://www.independent.co.uk/life-style/fashion/?service=rss'),
        (u'Life & Style -Food & Drink',
         u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'),
        (u'Life & Style - Health and Families',
         u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'),
        (u'Life & Style - House & Home',
         u'http://www.independent.co.uk/life-style/house-and-home/?service=rss'),
        (u'Life & Style - History',
         u'http://www.independent.co.uk/life-style/history/?service=rss'),
        (u'Life & Style - Gadgets & Tech',
         u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'),
        (u'Life & Style - Motoring',
         u'http://www.independent.co.uk/life-style/motoring/?service=rss'),
        (u'Arts & Ents - Art',
         u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'),
        (u'Arts & Ents - Architecture',
         u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'),
        (u'Arts & Ents - Music',
         u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'),
        (u'Arts & Ents - Classical',
         u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'),
        (u'Arts & Ents - Films',
         u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'),
        (u'Arts & Ents - TV',
         u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'),
        (u'Arts & Ents - Theatre and Dance',
         u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'),
        (u'Arts & Ents - Comedy',
         u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'),
        (u'Arts & Ents - Books',
         u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'),
        (u'Travel',
         u'http://www.independent.co.uk/travel/?service=rss'),
        (u'Money',
         u'http://www.independent.co.uk/money/?service=rss'),
        (u'IndyBest',
         u'http://www.independent.co.uk/extras/indybest/?service=rss'),
        ]