Read Folha de São Paulo on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Brazilian news from Folha de São Paulo

Language: pt

Requires Subscription: No, it's available as free ebook

Schedule Every morning

			  # -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import datetime, timedelta
from calibre.ebooks.BeautifulSoup import Tag,BeautifulSoup
from calibre.utils.magick import Image, PixelWand
from urllib2 import Request, urlopen, URLError

class FolhaOnline(BasicNewsRecipe):
    THUMBALIZR_API        = '' # ---->Get your at http://www.thumbalizr.com/ and put here
    LANGUAGE              = 'pt_br'
    language = 'pt'
    LANGHTM               = 'pt-br'
    ENCODING              = 'cp1252'
    ENCHTM                = 'iso-8859-1'
    directionhtm          = 'ltr'
    requires_version      = (0,7,47)
    news                  = True

    title                 = u'Folha de S\xE3o Paulo'
    __author__            = 'Euler Alves and Alex Mitrani'
    description           = u'Brazilian news from Folha de S\xE3o Paulo'
    publisher             = u'Folha de S\xE3o Paulo'
    category              = 'news, rss'

    oldest_article        = 4
    max_articles_per_feed = 100
    summary_length        = 1000

    remove_javascript     = True
    no_stylesheets        = True
    use_embedded_content  = False
    remove_empty_feeds    = True
    timefmt               = ' [%d %b %Y (%a)]'

    html2lrf_options      = [
                            '--comment', description
                            ,'--category', category
                            ,'--publisher', publisher
    ]

    html2epub_options     = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"'

    hoje                  = datetime.now()
    pubdate               = hoje.strftime('%a, %d %b')
    if hoje.hour<6:
        hoje = hoje-timedelta(days=1)
    CAPA                  = 'http://www1.folha.uol.com.br/fsp/images/cp'+hoje.strftime('%d%m%Y')+'.jpg'
    SCREENSHOT            = 'http://www1.folha.uol.com.br/'
    cover_margins         = (0,0,'white')
    masthead_url          = 'http://f.i.uol.com.br/fsp/furniture/images/lgo-fsp-430x50-ffffff.gif'

    keep_only_tags      = [dict(name='div', attrs={'id':'articleNew'})]
    remove_tags         = [
                        dict(name='div',
                            attrs={'id':[
                                'articleButton'
                                ,'bookmarklets'
                                ,'ad-180x150-1'
                                ,'contextualAdsArticle'
                                ,'articleEnd'
                                ,'articleComments'
                            ]})
                        ,dict(name='div',
                            attrs={'class':[
                            'openBox adslibraryArticle'
	       ,'toolbar'
                            ]})

                        ,dict(name='a')
                        ,dict(name='iframe')
                        ,dict(name='link')
                        ,dict(name='script')
	    ,dict(name='li')
    ]
    remove_tags_after = 	dict(name='div',attrs={'id':'articleEnd'})

    feeds = [
    (u'Em cima da hora', u'http://feeds.folha.uol.com.br/emcimadahora/rss091.xml')
    ,(u'Cotidiano', u'http://feeds.folha.uol.com.br/folha/cotidiano/rss091.xml')
    ,(u'Brasil', u'http://feeds.folha.uol.com.br/folha/brasil/rss091.xml')
    ,(u'Mundo', u'http://feeds.folha.uol.com.br/mundo/rss091.xml')
    ,(u'Poder', u'http://feeds.folha.uol.com.br/poder/rss091.xml')
    ,(u'Mercado', u'http://feeds.folha.uol.com.br/folha/dinheiro/rss091.xml')
    ,(u'Saber', u'http://feeds.folha.uol.com.br/folha/educacao/rss091.xml')
    ,(u'Tec', u'http://feeds.folha.uol.com.br/folha/informatica/rss091.xml')
    ,(u'Ilustrada', u'http://feeds.folha.uol.com.br/folha/ilustrada/rss091.xml')
    ,(u'Ambiente', u'http://feeds.folha.uol.com.br/ambiente/rss091.xml')
    ,(u'Bichos', u'http://feeds.folha.uol.com.br/bichos/rss091.xml')
    ,(u'Ci\xEAncia', u'http://feeds.folha.uol.com.br/ciencia/rss091.xml')
    ,(u'Equil\xEDbrio e Sa\xFAde', u'http://feeds.folha.uol.com.br/equilibrioesaude/rss091.xml')
    ,(u'Turismo', u'http://feeds.folha.uol.com.br/folha/turismo/rss091.xml')
    ,(u'Esporte', u'http://feeds.folha.uol.com.br/folha/esporte/rss091.xml')
    ,(u'Zapping', u'http://feeds.folha.uol.com.br/colunas/zapping/rss091.xml')
    ,(u'Cida Santos', u'http://feeds.folha.uol.com.br/colunas/cidasantos/rss091.xml')
    ,(u'Clóvis Rossi', u'http://feeds.folha.uol.com.br/colunas/clovisrossi/rss091.xml')
    ,(u'Eliane Cantanhêde', u'http://feeds.folha.uol.com.br/colunas/elianecantanhede/rss091.xml')
    ,(u'Fernando Canzian', u'http://feeds.folha.uol.com.br/colunas/fernandocanzian/rss091.xml')
    ,(u'Gilberto Dimenstein', u'http://feeds.folha.uol.com.br/colunas/gilbertodimenstein/rss091.xml')
    ,(u'Hélio Schwartsman', u'http://feeds.folha.uol.com.br/colunas/helioschwartsman/rss091.xml')
    ,(u'Humberto Luiz Peron', u'http://feeds.folha.uol.com.br/colunas/futebolnarede/rss091.xml')
    ,(u'João Pereira Coutinho', u'http://feeds.folha.uol.com.br/colunas/joaopereiracoutinho/rss091.xml')
    ,(u'José Antonio Ramalho', u'http://feeds.folha.uol.com.br/colunas/canalaberto/rss091.xml')
    ,(u'Kennedy Alencar', u'http://feeds.folha.uol.com.br/colunas/kennedyalencar/rss091.xml')
    ,(u'Luiz Caversan', u'http://feeds.folha.uol.com.br/colunas/luizcaversan/rss091.xml')
    ,(u'Luiz Rivoiro', u'http://feeds.folha.uol.com.br/colunas/paiepai/rss091.xml')
    ,(u'Marcelo Leite', u'http://feeds.folha.uol.com.br/colunas/marceloleite/rss091.xml')
    ,(u'Sérgio Malbergier', u'http://feeds.folha.uol.com.br/colunas/sergiomalbergier/rss091.xml')
    ,(u'Sylvia Colombo', u'http://feeds.folha.uol.com.br/colunas/sylviacolombo/rss091.xml')
    ,(u'Valdo Cruz', u'http://feeds.folha.uol.com.br/colunas/valdocruz/rss091.xml')
    ]


    conversion_options = {
    'title'            : title
    ,'comments'        : description
    ,'publisher'       : publisher
    ,'tags'            : category
    ,'language'        : LANGUAGE
    ,'linearize_tables': True
    }

    def preprocess_html(self, soup):
        for item in soup.findAll(style=True):
            del item['style']
        if not soup.find(attrs={'http-equiv':'Content-Language'}):
            meta0 = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.LANGHTM)])
            soup.head.insert(0,meta0)
        if not soup.find(attrs={'http-equiv':'Content-Type'}):
            meta1 = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset="+self.ENCHTM)])
            soup.head.insert(0,meta1)
        return soup

    def postprocess_html(self, soup, first):
        #process all the images. assumes that the new html has the correct path
        for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
            iurl = tag['src']
            img = Image()
            img.open(iurl)
            width, height = img.size
            print 'img is: ', iurl, 'width is: ', width, 'height is: ', height
            if img < 0:
                raise RuntimeError('Out of memory')
            pw = PixelWand()
            if( width > height and width > 590) :
                print 'Rotate image'
                img.rotate(pw, -90)
                img.save(iurl)
        return soup

    def get_cover_url(self):
        cover_url      = self.CAPA
        pedido         = Request(self.CAPA)
        pedido.add_header('User-agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; '+self.LANGHTM+'; userid='+self.THUMBALIZR_API+') Calibre/0.8.47 (like Gecko)')
        pedido.add_header('Accept-Charset',self.ENCHTM)
        pedido.add_header('Referer',self.SCREENSHOT)
        try:
            resposta   = urlopen(pedido)
            soup       = BeautifulSoup(resposta)
            cover_item = soup.find('body')
            if cover_item:
                cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
            return cover_url
        except URLError:
            cover_url='http://api.thumbalizr.com/?api_key='+self.THUMBALIZR_API+'&url='+self.SCREENSHOT+'&width=600&quality=90'
            return cover_url