Read Ars Technica on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Ars Technica is a technology news and information website. It publishes news, reviews and guides on issues such as computer hardware and software, science, technology policy, and video games.

Language: en

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  __license__   = 'GPL v3'
__copyright__ = '2008-2011, Darko Miletic <darko.miletic at gmail.com>'
'''
arstechnica.com
'''

import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

class ArsTechnica(BasicNewsRecipe):
    """calibre news recipe for the Ars Technica section feeds.

    Articles are reduced to the ``story`` / ``etc-story`` container,
    inline styles and ids are stripped, and multi-page stories are
    stitched together by following the pager's "Next" link.
    """

    title                 = u'Ars Technica'
    language              = 'en'
    __author__            = 'Darko Miletic, Sujata Raman, Alexis Rohou'
    description           = 'The art of technology'
    publisher             = 'Ars Technica'
    category              = 'news, IT, technology'
    oldest_article        = 5
    max_articles_per_feed = 100
    no_stylesheets        = True
    encoding              = 'utf-8'
    use_embedded_content  = False
    extra_css             = '''
				body {font-family: Arial,Helvetica,sans-serif}
				.title{text-align: left}
				.byline{font-weight: bold; line-height: 1em; font-size: 0.625em; text-decoration: none}
				.news-item-figure-caption-text{font-size:small; font-style:italic}
				.news-item-figure-caption-byline{font-size:small; font-style:italic; font-weight:bold}
				'''
    # Etc feed items can be ignored, as they're not real stories
    ignoreEtcArticles     = True

    # Prefix for site-relative links (pager and author hrefs).
    _SITE_URL = 'http://arstechnica.com'

    conversion_options = {
        'comments': description,
        'tags': category,
        'language': language,
        'publisher': publisher,
    }

    # Disabled regexp preprocessing, kept for reference:
    #preprocess_regexps = [
    #            (re.compile(r'<div class="news-item-figure', re.DOTALL|re.IGNORECASE),lambda match: '<div class="news-item-figure"')
    #           ,(re.compile(r'</title>.*?</head>', re.DOTALL|re.IGNORECASE),lambda match: '</title></head>')
    #                     ]

    keep_only_tags = [dict(name='div', attrs={'id': ['story', 'etc-story']})]

    remove_tags = [
        dict(name=['object', 'link', 'embed']),
        dict(name='div', attrs={'class': 'read-more-link'}),
    ]
    #remove_attributes=['width','height']

    feeds = [
        (u'Infinite Loop (Apple content)', u'http://feeds.arstechnica.com/arstechnica/apple/'),
        (u'Opposable Thumbs (Gaming content)', u'http://feeds.arstechnica.com/arstechnica/gaming/'),
        (u'Gear and Gadgets', u'http://feeds.arstechnica.com/arstechnica/gadgets/'),
        (u'Chipster (Hardware content)', u'http://feeds.arstechnica.com/arstechnica/hardware/'),
        (u'Uptime (IT content)', u'http://feeds.arstechnica.com/arstechnica/business/'),
        (u'Open Ended (Open Source content)', u'http://feeds.arstechnica.com/arstechnica/open-source/'),
        (u'One Microsoft Way', u'http://feeds.arstechnica.com/arstechnica/microsoft/'),
        (u'Nobel Intent (Science content)', u'http://feeds.arstechnica.com/arstechnica/science/'),
        (u'Law & Disorder (Tech policy content)', u'http://feeds.arstechnica.com/arstechnica/tech-policy/'),
    ]

    def append_page(self, soup, appendtag, position):
        """Splice the following page(s) of a multi-page story into the soup.

        Looks for a ``div.pager`` in ``soup``; when one of its links has
        text starting with "Next", that page is downloaded, its
        ``div.body`` is cleaned of inline styles, any further pages are
        appended to it recursively, and the result is inserted into
        ``appendtag`` at index ``position``. The pager is removed from
        ``soup`` once the next page has been spliced in.

        Does nothing when there is no pager, no "Next" link, or the next
        page has no ``div.body``.
        """
        pager = soup.find('div', attrs={'class': 'pager'})
        if not pager:
            return

        for atag in pager.findAll('a', href=True):
            link_text = self.tag_to_string(atag)
            if not link_text.startswith('Next'):
                continue

            href = atag['href']
            # Pager links are normally site-relative; leave absolute ones alone.
            if href.startswith('http'):
                next_url = href
            else:
                next_url = self._SITE_URL + href
            # Second argument requests the raw page rather than a parsed soup.
            raw_page = self.index_to_soup(next_url, True)
            next_soup = BeautifulSoup(raw_page, fromEncoding=self.encoding)

            readmoretag = next_soup.find('div', attrs={'class': 'read-more-link'})
            if readmoretag:
                readmoretag.extract()

            texttag = next_soup.find('div', attrs={'class': 'body'})
            if texttag is None:
                # Unexpected page layout: nothing to append.
                return
            for styled in texttag.findAll(style=True):
                del styled['style']

            # Pull in page 3, 4, ... before moving this body into the caller's soup.
            self.append_page(next_soup, texttag, len(texttag.contents))
            texttag.extract()
            pager.extract()
            appendtag.insert(position, texttag)
            # Only one "Next" link is followed; a second one would
            # append the same page twice.
            return

    def preprocess_html(self, soup):
        """Clean up a downloaded article and merge in its extra pages.

        Returns the same ``soup`` object after modifying it in place.
        """
        # Adds line breaks near the byline (not sure why this is needed)
        ftag = soup.find('div', attrs={'class': 'byline'})
        if ftag:
            ftag.insert(4, Tag(soup, 'br'))
            ftag.insert(5, Tag(soup, 'br'))

        # Remove style items
        for item in soup.findAll(style=True):
            del item['style']

        # Remove id
        for item in soup.findAll(id=True):
            del item['id']

        # For some reason, links to authors don't have the domain name
        a_author = soup.find('a', {'href': re.compile("^/author")})
        if a_author:
            a_author['href'] = self._SITE_URL + a_author['href']

        # TODO: within div class news-item-figure, we need to grab images

        # Deal with multi-page stories
        self.append_page(soup, soup.body, 3)

        return soup

    def get_article_url(self, article):
        """Return the URL to download for a feed entry, or None to skip it.

        Entries whose title starts with ``"Etc: "`` are skipped while
        ``ignoreEtcArticles`` is true. Otherwise the entry's ``guid`` is
        returned with everything from its last ``?`` onwards removed. An
        entry without a guid yields None.
        """
        # If the article title starts with Etc:, don't return it
        if self.ignoreEtcArticles:
            article_title = article.get('title', None) or ''
            if article_title.startswith('Etc: '):
                return None

        # The actual article is in a guid tag
        guid = article.get('guid', None)
        if not guid:
            return None
        if '?' not in guid:
            # rpartition() would return '' here and lose the URL.
            return guid
        return guid.rpartition('?')[0]