Read The Independent on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
Independent News - Breaking news, comment and features from The Independent newspaper
Language: en
Requires Subscription: No, it's available as free ebook
Schedule Every morning
# adapted from old recipe by Darko Miletic <darko.miletic at gmail.com>
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
class TheIndependentNew(BasicNewsRecipe):
# flag to enable/disable article graphics on business pages/some others
# eg http://www.independent.co.uk/news/world/europe/berlusconi-departure-fails-to-calm-the-markets-6259682.html
# -max dimensions can be altered using the .pictureContainer img selector in the css
_FETCH_ARTICLE_GRAPHICS = True
#Flag to enable/disable image fetching (not business)
_FETCH_IMAGES = True
#used for converting rating to stars
_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star.png'
_NO_STAR_URL = 'http://www.independent.co.uk/skins/ind/images/rating_star_grey.png'
title = u'The Independent'
__author__ = 'Will'
description = 'The latest in UK News and World News from The \
Independent. Wide range of international and local news, sports \
news, commentary and opinion pieces.Independent News - Breaking news \
that matters. Your daily comprehensive news source - The \
Independent Newspaper'
publisher = 'The Independent'
category = 'news, UK'
no_stylesheets = True
use_embedded_content = False
remove_empty_feeds = True
language = 'en_GB'
publication_type = 'newspaper'
masthead_url = 'http://www.independent.co.uk/independent.co.uk/editorial/logo/independent_Masthead.png'
encoding = 'utf-8'
remove_tags =[
dict(attrs={'id' : ['RelatedArtTag','renderBiography']}),
dict(attrs={'class' : ['autoplay','openBiogPopup']}),
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
dict(attrs={'style' : re.compile('.*')}),
]
keep_only_tags =[dict(attrs={'id':'main'})]
recursions = 0
# fixes non compliant html nesting and 'marks' article graphics links
preprocess_regexps = [
(re.compile('<span class="storyTop ">(?P<nested>.*?)</span>', re.DOTALL),
lambda match: '<div class="storyTop">' + match.group('nested') + '</div>'),
(re.compile('(<strong>.*?[Cc]lick.*?<a.*?((HERE)|([Hh]ere)).*?</strong>)', re.DOTALL),
lambda match: '<div class="article-graphic">' + match.group(0) + '</div>'),
]
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
extra_css = """
h1{font-family: Georgia,serif }
body{font-family: Verdana,Arial,Helvetica,sans-serif}
img{margin-bottom: 0.4em; display:block}
.starRating img {float: left}
.starRating {margin-top:0.4em; display: block}
.image {clear:left; font-size: x-small; color:#888888;}
.articleByTimeLocation {font-size: x-small; color:#888888;
margin-bottom:0.2em ; margin-top:0.2em ; display:block}
.subtitle {clear:left}
.column-1 h1 { color: #191919}
.column-1 h2 { color: #333333}
.column-1 h3 { color: #444444}
.column-1 p { color: #777777}
.column-1 p,a,h1,h2,h3 { margin: 0; }
.column-1 div{color:#888888; margin: 0;}
.articleContent {display: block; clear:left;}
.storyTop{}
.pictureContainer img { max-width: 400px; max-height: 400px;}
"""
oldest_article = 1
max_articles_per_feed = 100
_processed_urls = []
def get_article_url(self, article):
url = super(self.__class__,self).get_article_url(article)
title = article.get('title', None)
if title and re.search("^Video:",title):
return None
#remove duplicates
if not (url in self._processed_urls):
self._processed_urls.append(url)
else:
url = None
return url
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
def preprocess_html(self, soup):
#remove 'advertorial articles'
strapline = soup.find('div',attrs={'class' : re.compile('.*strapLine.*')})
if strapline:
for para in strapline.findAll('p'):
if len(para.contents) and isinstance(para.contents[0],NavigableString) \
and para.contents[0] == 'ADVERTORIAL FEATURE':
return None
items_to_extract = []
slideshow_elements = []
for item in soup.findAll(attrs={'class' : re.compile("widget.*")}):
remove = True
pattern = re.compile('((articleContent)|(title))$')
if (pattern.search(item['class'])) is not None:
remove = False
# corrections
# story content always good
pattern = re.compile('storyContent')
if (pattern.search(item['class'])) is not None:
remove = False
#images
pattern = re.compile('slideshow')
if (pattern.search(item['class'])) is not None:
if self._FETCH_IMAGES:
remove = False
slideshow_elements.append(item)
else:
remove = True
#social widgets always bad
pattern = re.compile('socialwidget')
if (pattern.search(item['class'])) is not None:
remove = True
if remove:
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
if self._FETCH_IMAGES:
for element in slideshow_elements:
for item in element.findAll('a',attrs={'href' : re.compile('.*')}):
if item.img is not None:
#use full size image
img = item.findNext('img')
img['src'] = item['href']
#insert caption if available
if img.get('title') and (len(img['title']) > 1):
tag = Tag(soup,'h3')
text = NavigableString(img['title'])
tag.insert(0,text)
#picture before text
img.extract()
item.insert(0,img)
item.insert(1,tag)
# remove link
item.name = "div"
item["class"]='image'
del item["href"]
#remove empty subtitles
"""
currently the subtitle is located in first paragraph after
sibling <h3 class="subtitle"> tag. This may be 'fixed' at
some point.
"""
subtitle = soup.find('h3',attrs={'class' : 'subtitle'})
if subtitle is not None:
subtitleText = subtitle.findNext('p')
if subtitleText is not None:
if len(subtitleText.contents[0]) <= 1 :
subtitleText.extract()
subtitle.extract()
#replace rating numbers with stars
for item in soup.findAll('div',attrs={ 'class' : 'starRating'}):
if item is not None:
soup2 = self._insertRatingStars(soup,item)
if soup2 is not None:
soup = soup2
#remove empty paragraph tags in storyTop which can leave a space
#between first paragraph and rest of story
nested_content = False
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
for item in storyTop.findAll('p'):
for nested in item:
if isinstance(nested, Tag):
nested_content = True
break
if not nested_content and item.contents is not None and len(item.contents[0]) <= 1 :
items_to_extract.append(item)
for item in items_to_extract:
item.extract()
items_to_extract = []
#remove line breaks immediately next to tags with default margins
#to prevent double line spacing and narrow columns of text
storyTop = soup.find('div',attrs={ 'class' : ['storyTop']})
self._remove_undesired_line_breaks_from_tag(storyTop,soup)
#replace article graphics link with the graphics themselves
if self._FETCH_ARTICLE_GRAPHICS:
items_to_insert = []
for item in soup.findAll('div', attrs={'class' : ['article-graphic']}):
strong = item.find('strong')
if not strong:
continue
for child in strong:
if isinstance(child,Tag):
if str(child.name) == 'a':
items_to_insert.extend(self._get_article_graphic(strong,child['href'],soup))
for item in items_to_insert:
item[0].replaceWith(item[1])
for item in items_to_extract:
item.extract()
return soup
def _get_article_graphic(self,old_item,url,soup):
items_to_insert = []
if re.search('\.jpg$',str(url)):
div = Tag(soup,'div')
div['class'] = 'pictureContainer'
img = Tag(soup,'img')
img['src'] = url
img['alt'] = 'article graphic'
div.insert(0,img)
items_to_insert.append((old_item,div,))
return items_to_insert
soup2 = self.index_to_soup(url)
for item in soup2.findAll('div',attrs={'class' : re.compile("widget picture article.*")}):
items_to_insert.append((old_item,item),)
return items_to_insert
def _insertRatingStars(self,soup,item):
if item.contents is None or len(item.contents) < 1:
return
rating = item.contents[0]
try:
rating = float(item.contents[0])
except:
print 'Could not convert decimal rating to star: malformatted float.'
return
for i in range(1,6):
star = Tag(soup,'img')
if i <= rating:
star['src'] = self._STAR_URL
else:
star['src'] = self._NO_STAR_URL
star['alt'] = 'star number ' + str(i)
item.insert(i,star)
#item.contents[0] = NavigableString('(' + str(rating) + ')')
item.contents[0] = ''
def postprocess_html(self,soup, first_fetch):
#find broken images and remove captions
items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}):
img = item.findNext('img')
if img and img.get('src'):
# broken images still point to remote url
pattern = re.compile('http://www.independent.co.uk.*')
if pattern.match(img["src"]) is not None:
caption = img.findNextSibling('h3')
if caption is not None:
items_to_extract.append(caption)
items_to_extract.append(img)
for item in items_to_extract:
item.extract()
return soup
def _recurisvely_linearise_tag_tree(
self,
item,
linearised= None,
count=0,
limit = 100
):
linearised = linearised or []
count = count + 1
if count > limit:
return linearised
if not (isinstance(item,Tag)):
return linearised
for nested in item:
linearised.append(nested)
linearised = self._recurisvely_linearise_tag_tree(nested,linearised, count)
return linearised
def _get_previous_tag(self,current_index, tag_tree):
if current_index == 0:
return None
else:
return tag_tree[current_index - 1]
def _get_next_tag(self,current_index, tag_tree):
if current_index < len(tag_tree) - 1:
return tag_tree[current_index + 1]
else:
return None
def _list_match(self,test_str, list_regex):
for regex in list_regex:
match = re.match(regex, test_str)
if match is not None:
return True
return False
def _remove_undesired_line_breaks_from_tag(self,parent,soup):
if parent is None:
return
tag_tree = self._recurisvely_linearise_tag_tree(parent)
items_to_remove = []
for item in tag_tree:
if item == u'\n':
items_to_remove.append(item)
continue;
for item in items_to_remove:
tag_tree.remove(item)
spaced_tags = [r'p', r'h\d', r'blockquote']
tags_to_extract = []
tags_to_replace = []
for (i, tag) in enumerate(tag_tree):
if isinstance(tag, Tag):
if str(tag) == '<br />':
previous_tag = self._get_previous_tag(i, tag_tree)
if isinstance(previous_tag, Tag):
previous_tag_is_spaced = previous_tag is not None\
and self._list_match(str(previous_tag.name),
spaced_tags)
else:
previous_tag_is_spaced = False
next_tag = self._get_next_tag(i, tag_tree)
if isinstance(next_tag, Tag):
next_tag_is_spaced = next_tag is not None\
and self._list_match(str(next_tag.name), spaced_tags)
else:
next_tag_is_spaced = False
if previous_tag_is_spaced or next_tag_is_spaced or i == 0\
or i == len(tag_tree) - 1:
tags_to_extract.append(tag)
else:
tags_to_replace.append((tag,NavigableString(' '),))
for pair in tags_to_replace:
pair[0].replaceWith(pair[1])
for tag in tags_to_extract:
tag.extract()
feeds = [
(u'News - UK',
u'http://www.independent.co.uk/news/uk/?service=rss'),
(u'News - World',
u'http://www.independent.co.uk/news/world/?service=rss'),
(u'News - Business',
u'http://www.independent.co.uk/news/business/?service=rss'),
(u'News - People',
u'http://www.independent.co.uk/news/people/?service=rss'),
(u'News - Science',
u'http://www.independent.co.uk/news/science/?service=rss'),
(u'News - Media',
u'http://www.independent.co.uk/news/media/?service=rss'),
(u'News - Education',
u'http://www.independent.co.uk/news/education/?service=rss'),
(u'News - Obituaries',
u'http://www.independent.co.uk/news/obituaries/?service=rss'),
(u'News - Corrections',
u'http://www.independent.co.uk/news/corrections/?service=rss'
),
(u'Opinion',
u'http://www.independent.co.uk/opinion/?service=rss'),
(u'Environment',
u'http://www.independent.co.uk/environment/?service=rss'),
(u'Sport - Athletics',
u'http://www.independent.co.uk/sport/general/athletics/?service=rss'
),
(u'Sport - Cricket',
u'http://www.independent.co.uk/sport/cricket/?service=rss'),
(u'Sport - Football',
u'http://www.independent.co.uk/sport/football/?service=rss'),
(u'Sport - Golf',
u'http://www.independent.co.uk/sport/golf/?service=rss'),
(u'Sport - Motor racing',
u'http://www.independent.co.uk/sport/motor-racing/?service=rss'
),
(u'Sport - Olympics',
u'http://www.independent.co.uk/sport/olympics/?service=rss'),
(u'Sport - Racing',
u'http://www.independent.co.uk/sport/racing/?service=rss'),
(u'Sport - Rugby League',
u'http://www.independent.co.uk/sport/general/rugby-league/?service=rss'),
(u'Sport - Rugby Union',
u'http://www.independent.co.uk/sport/rugby/rugby-union/?service=rss'
),
(u'Sport - Sailing',
u'http://www.independent.co.uk/sport/general/sailing/?service=rss'
),
(u'Sport - Tennis',
u'http://www.independent.co.uk/sport/tennis/?service=rss'),
(u'Sport - Others',
u'http://www.independent.co.uk/sport/general/others/?service=rss'
),
(u'Life & Style - Fashion',
u'http://www.independent.co.uk/life-style/fashion/?service=rss'
),
(u'Life & Style -Food & Drink',
u'http://www.independent.co.uk/life-style/food-and-drink/?service=rss'
),
(u'Life & Style - Health and Families',
u'http://www.independent.co.uk/life-style/health-and-families/?service=rss'
),
(u'Life & Style - House & Home',
u'http://www.independent.co.uk/life-style/house-and-home/'),
(u'Life & Style - History',
u'http://www.independent.co.uk/life-style/history/?service=rss'
),
(u'Life & Style - Gadgets & Tech',
u'http://www.independent.co.uk/life-style/gadgets-and-tech/?service=rss'
),
(u'Life & Style - Motoring',
u'http://www.independent.co.uk/life-style/motoring/?service=rss'
),
(u'Arts & Ents - Art',
u'http://www.independent.co.uk/arts-entertainment/art/?service=rss'
),
(u'Arts & Ents - Architecture',
u'http://www.independent.co.uk/arts-entertainment/architecture/?service=rss'
),
(u'Arts & Ents - Music',
u'http://www.independent.co.uk/arts-entertainment/music/?service=rss'
),
(u'Arts & Ents - Classical',
u'http://www.independent.co.uk/arts-entertainment/classical/?service=rss'
),
(u'Arts & Ents - Films',
u'http://www.independent.co.uk/arts-entertainment/films/?service=rss'
),
(u'Arts & Ents - TV',
u'http://www.independent.co.uk/arts-entertainment/tv/?service=rss'
),
(u'Arts & Ents - Theatre and Dance',
u'http://www.independent.co.uk/arts-entertainment/theatre-dance/?service=rss'
),
(u'Arts & Ents - Comedy',
u'http://www.independent.co.uk/arts-entertainment/comedy/?service=rss'
),
(u'Arts & Ents - Books',
u'http://www.independent.co.uk/arts-entertainment/books/?service=rss'
),
(u'Travel', u'http://www.independent.co.uk/travel/?service=rss'
),
(u'Money', u'http://www.independent.co.uk/money/?service=rss'),
(u'IndyBest',
u'http://www.independent.co.uk/extras/indybest/?service=rss'),
]