Read New Scientist on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Science news and science articles from New Scientist.

Language: en

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  ##
## Title:        New Scientist RSS recipe
## Contact:      AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## License:      GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html
## Copyright:    2008-2010, AprilHare, Darko Miletic <darko.miletic at gmail.com>
##
## Written:      2008
## Last Edited:  Jan 2012
##

'''
01-19-2012: Added grayscale image conversion and duplicate article removal
'''

__license__   = 'GNU General Public License v3 - http://www.gnu.org/copyleft/gpl.html'
__copyright__ = '2008-2012, AprilHare, Darko Miletic <darko.miletic at gmail.com>'
__version__     = 'v0.5.0'
__date__        = '2012-01-19'
__author__      = 'Darko Miletic'

'''
newscientist.com
'''

import re
import urllib
from calibre.utils.magick import Image
from calibre.web.feeds.news import BasicNewsRecipe

class NewScientist(BasicNewsRecipe):
    """calibre news recipe that builds an ebook from the New Scientist RSS feeds.

    Logging in is optional: when both a username and a password are set, the
    browser posts them to the site's login form before articles are fetched.
    """

    title                 = 'New Scientist - Online News w. subscription'
    description           = 'Science news and science articles from New Scientist.'
    language              = 'en'
    publisher             = 'Reed Business Information Ltd.'
    category              = 'science news, science articles, science jobs, drugs, cancer, depression, computer software'
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    cover_url             = 'http://www.newscientist.com/currentcover.jpg'
    masthead_url          = 'http://www.newscientist.com/img/misc/ns_logo.jpg'
    encoding              = 'utf-8'
    needs_subscription    = 'optional'
    extra_css             = """
                                 body{font-family: Arial,sans-serif}
                                 img{margin-bottom: 0.8em; display: block}
                                 .quotebx{font-size: x-large; font-weight: bold; margin-right: 2em; margin-left: 2em}
                            """

    # Metadata handed to the conversion step; it mirrors the attributes above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    # Drop everything between </title> and </head> before the page is parsed.
    preprocess_regexps = [
        (re.compile(r'</title>.*?</head>', re.DOTALL | re.IGNORECASE),
         lambda match: '</title></head>'),
    ]

    keep_only_tags = [
        dict(name='div', attrs={'id': ['pgtop', 'maincol', 'blgmaincol', 'nsblgposts', 'hldgalcols']}),
    ]

    # Whether to omit duplicates of articles (typically arising when articles
    # are indexed in more than one section). If True, only the first
    # occurrence will be downloaded.
    filterDuplicates = True

    # Whether to convert images to grayscale for eInk readers.
    Convert_Grayscale = False

    # URLs already handed out by print_version(); used for duplicate detection.
    # This is a class attribute, so the same list is shared by every instance.
    url_list = []

    def get_browser(self):
        """Return a browser that has visited the home page and, if possible, logged in.

        The login form is only submitted when both ``self.username`` and
        ``self.password`` are set.
        """
        # get_browser is an instance method in calibre's documented API, so
        # the instance has to be passed explicitly when calling via the class.
        br = BasicNewsRecipe.get_browser(self)
        br.open('http://www.newscientist.com/')
        if self.username is not None and self.password is not None:
            login_url = 'https://www.newscientist.com/user/login'
            br.open(login_url)
            data = urllib.urlencode({
                'source': 'form',
                'redirectURL': '',
                'loginId': self.username,
                'password': self.password,
            })
            # Passing form data makes this second request a POST of the credentials.
            br.open(login_url, data)
        return br

    remove_tags = [
        dict(name='div', attrs={'class': ['hldBd', 'adline', 'pnl', 'infotext']}),
        dict(name='div', attrs={'id': ['compnl', 'artIssueInfo', 'artTools', 'comments', 'blgsocial', 'sharebtns']}),
        dict(name='p', attrs={'class': ['marker', 'infotext']}),
        dict(name='meta', attrs={'name': 'description'}),
        dict(name='a', attrs={'rel': 'tag'}),
        dict(name='ul', attrs={'class': 'markerlist'}),
        dict(name=['link', 'base', 'meta', 'iframe', 'object', 'embed']),
    ]
    remove_tags_after = dict(attrs={'class': ['nbpcopy', 'comments']})
    remove_attributes = ['height', 'width', 'lang', 'onclick']

    feeds = [
        (u'Latest Headlines', u'http://feeds.newscientist.com/science-news'),
        (u'Magazine', u'http://feeds.newscientist.com/magazine'),
        (u'Health', u'http://feeds.newscientist.com/health'),
        (u'Life', u'http://feeds.newscientist.com/life'),
        (u'Space', u'http://feeds.newscientist.com/space'),
        (u'Physics and Mathematics', u'http://feeds.newscientist.com/physics-math'),
        (u'Environment', u'http://feeds.newscientist.com/environment'),
        (u'Science in Society', u'http://feeds.newscientist.com/science-in-society'),
        (u'Tech', u'http://feeds.newscientist.com/tech'),
    ]

    def get_article_url(self, article):
        """Return the feed entry's ``guid`` as the article URL, or None if absent."""
        return article.get('guid', None)

    def print_version(self, url):
        """Return the full-text print URL for *url*, or None for a duplicate.

        Every URL that is not rejected is recorded in ``url_list``. When
        ``filterDuplicates`` is True, a URL that was already recorded yields
        None instead of a print URL.
        """
        if self.filterDuplicates and url in self.url_list:
            # NOTE(review): presumably calibre skips articles whose print
            # version is None - confirm against the BasicNewsRecipe docs.
            return None
        self.url_list.append(url)
        return url + '?full=true&print=true'

    def preprocess_html(self, soup):
        """Simplify the parsed article markup and return the modified soup.

        Removes the ``id`` of the <html> element and all inline styles,
        turns <quote>/<quotetext> into paragraphs, and replaces <xref>,
        <figref> and <a> tags with their text. The parent of a link whose
        text is exactly 'Home' is removed altogether.
        """
        if soup.html.has_key('id'):
            del soup.html['id']
        for item in soup.findAll(style=True):
            del item['style']
        for item in soup.findAll(['quote', 'quotetext']):
            item.name = 'p'
        for item in soup.findAll(['xref', 'figref']):
            text = item.string
            # .string is None when the tag does not hold a single text node;
            # replacing a tag with None fails, so such tags are left alone.
            if text is not None:
                item.replaceWith(text)
        for link in soup.findAll('a'):
            text = link.string
            if text == 'Home':
                link.parent.extract()
            elif text is not None:
                link.replaceWith(text)
        return soup

    # Converts images to Gray Scale
    def postprocess_html(self, soup, first):
        """Convert every image with a ``src`` to grayscale when enabled.

        Does nothing unless ``Convert_Grayscale`` is True. Each image is
        opened from its ``src`` value and saved back to the same location.
        Returns the soup unchanged.
        """
        if self.Convert_Grayscale:
            # process all the images
            for tag in soup.findAll(lambda tag: tag.name.lower() == 'img' and tag.has_key('src')):
                iurl = tag['src']
                img = Image()
                img.open(iurl)
                # NOTE(review): this compares the Image object itself with 0;
                # kept as in the original, but it is unclear what failure it
                # is meant to detect - confirm against calibre.utils.magick.
                if img < 0:
                    raise RuntimeError('Out of memory')
                img.type = "GrayscaleType"
                img.save(iurl)
        return soup