Read Nature News on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
Language: en
Requires Subscription: No, it's available as free ebook
Schedule Every morning
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
import re
class NatureNews(BasicNewsRecipe):
title = u'Nature News'
language = 'en'
__author__ = 'Krittika Goyal, Starson17'
oldest_article = 31 #days
remove_empty_feeds = True
max_articles_per_feed = 50
no_stylesheets = True
keep_only_tags = [dict(name='div', attrs={'id':'content'})]
# remove_tags_before = dict(name='h1', attrs={'class':'heading entry-title'})
# remove_tags_after = dict(name='h2', attrs={'id':'comments'})
remove_tags = [
dict(name='h2', attrs={'id':'comments'}),
dict(attrs={'alt':'Advertisement'}),
dict(name='div', attrs={'class':'ad'}),
dict(attrs={'class':'Z3988'}),
dict(attrs={'class':['formatpublished','type-of-article','cleardiv','disclaimer','buttons','comments xoxo']}),
dict(name='a', attrs={'href':'#comments'}),
dict(name='h2',attrs={'class':'subheading plusicon icon-add-comment'})
]
preprocess_regexps = [
(re.compile(r'<p>ADVERTISEMENT</p>', re.DOTALL|re.IGNORECASE), lambda match: ''),
]
extra_css = '''
.author { text-align: right; font-size: small; line-height:1em; margin-top:0px; margin-left:0; margin-right:0; margin-bottom: 0; }
.imagedescription { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
.imagecredit { font-size: x-small; font-style: normal; font-weight: bold}
'''
feeds = [('Nature News', 'http://feeds.nature.com/news/rss/most_recent')]
def preprocess_html(self,soup):
# The author name is slightly buried - dig it up
author = soup.find('p', {'class':'byline'})
if author:
# Find out the author's name
authornamediv = author.find('span',{'class':'author fn'})
authornamelink = authornamediv.find('a')
if authornamelink:
authorname = authornamelink.contents[0]
else:
authorname = authornamediv.contents[0]
# Stick the author's name in the byline tag
tag = Tag(soup,'div')
tag['class'] = 'author'
tag.insert(0,authorname.strip())
author.replaceWith(tag)
# Change the intro from a p to a div
intro = soup.find('p',{'class':'intro'})
if intro:
tag = Tag(soup,'div')
tag['class'] = 'intro'
tag.insert(0,intro.contents[0])
intro.replaceWith(tag)
# Change span class=imagedescription to div
descr = soup.find('span',{'class':'imagedescription'})
if descr:
tag = Tag(soup,'div')
tag['class'] = 'imagedescription'
tag.insert(0,descr.renderContents())
descr.replaceWith(tag)
# The references are in a list, let's make them simpler
reflistcont = soup.find('ul',{'id':'article-refrences'})
if reflistcont:
reflist = reflistcont.li.renderContents()
tag = Tag(soup,'div')
tag['class'] = 'article-references'
tag.insert(0,reflist)
reflistcont.replaceWith(tag)
# Within the id=content div, we need to remove all the stuff after the end of the class=entry-content
entrycontent = soup.find('div',{'class':'entry-content'})
for nextSibling in entrycontent.findNextSiblings():
nextSibling.extract()
return soup