Read Ведомости on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the forums applies here as well.

Download for free »

Ежедневная деловая газета

Language: ru

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  #!/usr/bin/env  python

u'''
Ведомости
'''

from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe

class VedomostiRecipe(BasicNewsRecipe):
    '''
    Calibre news recipe for the Vedomosti daily (vedomosti.ru).

    The newspaper RSS feed is regrouped into "virtual" feeds, one per RSS
    category tag (see parse_index). Every downloaded page is then reduced
    to the article text preceded by its title, picture and abstract and
    followed by the authors line (see postprocess_html).
    '''

    title = u'Ведомости'
    __author__ = 'Nikolai Kotchetkov'
    publisher = 'vedomosti.ru'
    category = 'press, Russia'
    description = u'Ежедневная деловая газета'
    oldest_article = 3
    max_articles_per_feed = 100

    masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
    cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'

    # Add feed names if you want them to be sorted
    # (feeds of this list appear first, in this order)
    sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']

    # Name of the virtual feed that collects articles without a usable tag
    _DEFAULT_FEED = u'_default'

    encoding = 'cp1251'
    language = 'ru'
    no_stylesheets = True
    remove_javascript = True
    recursions = 0

    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(name='td', attrs={'class': ['second_content']})]

    remove_tags_after = [dict(name='div', attrs={'class': 'article_text'})]

    remove_tags = [dict(name='div', attrs={'class': ['sep', 'choice', 'articleRightTbl']})]

    feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']

    # Base URL for relative links
    base_url = u'http://www.vedomosti.ru'

    extra_css = (
        'h1 {font-size: 1.5em; margin: 0em 0em 0em 0em; text-align: center;}'
        'h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;}'
        'h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'
        '.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'
        '.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'
        '.article_img {width:100%; text-align: center; padding: 3px 3px 3px 3px;}'
        '.article_img_desc {width:100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}'
        '.article_desc {font-size: 1em; font-style:italic;}'
    )

    def parse_index(self):
        '''
        Build the list of virtual feeds from the newspaper RSS feed.

        Every RSS entry that has both a link and a title is filed once under
        each distinct category tag it carries. Entries without any usable
        tag term go to the '_default' feed. Feeds named in sortOrder come
        first (in that order); the remaining feeds follow in the order in
        which they first appeared in the RSS data.

        Returns a list of (feed title, list of article dicts) tuples.
        Raises NotImplementedError on any failure, after logging the cause.
        '''
        try:
            feedData = parse(self.feeds[0])
            if not feedData:
                raise NotImplementedError
            self.log("parse_index: Feed loaded successfully.")

            # Take title/description from the feed, but never replace the
            # recipe defaults with an empty value.
            feedTitle = feedData.feed.get('title')
            if feedTitle:
                self.title = feedTitle
                self.log("parse_index: Title updated to: ", self.title)
            feedDescription = feedData.feed.get('description')
            if feedDescription:
                self.description = feedDescription
                self.log("parse_index: Description updated to: ", self.description)

            feeds = {}       # feed name -> (feed name, articles)
            feedOrder = []   # feed names in order of first appearance

            def get_virtual_feed_articles(feed):
                # Return the article list of a virtual feed, creating the
                # feed on first use.
                if feed not in feeds:
                    self.log("Adding new feed: ", feed)
                    feeds[feed] = (feed, [])
                    feedOrder.append(feed)
                return feeds[feed][1]

            # Iterate feed items and distribute articles using tags
            for item in feedData.entries:
                link = item.get('link', '')
                title = item.get('title', '')
                if not link or not title:
                    continue
                article = {
                    'title': title,
                    'url': link,
                    'description': item.get('description', ''),
                    'date': item.get('date', ''),
                    'content': '',
                }

                # Collect distinct target feeds so the article is never
                # added to the same feed twice (repeated or empty terms).
                targets = []
                for tag in item.get('tags') or []:
                    term = tag.get('term') or self._DEFAULT_FEED
                    if term not in targets:
                        targets.append(term)
                # No tags at all (missing key or empty list): keep the
                # article instead of silently dropping it.
                if not targets:
                    targets.append(self._DEFAULT_FEED)

                for term in targets:
                    get_virtual_feed_articles(term).append(article)

            # Get feed list: explicitly sorted feeds first of all ...
            result = []
            for feedName in self.sortOrder:
                if feedName in feeds:
                    result.append(feeds.pop(feedName))
            # ... then everything else in order of first appearance
            result.extend(feeds[name] for name in feedOrder if name in feeds)

            return result

        except Exception as err:
            # NOTE(review): NotImplementedError presumably makes calibre fall
            # back to its standard feed handling - confirm against the
            # BasicNewsRecipe documentation.
            self.log(err)
            raise NotImplementedError

    def preprocess_html(self, soup):
        '''Pass the downloaded page through adeify_images and return the result.'''
        return self.adeify_images(soup)

    def postprocess_html(self, soup, first_fetch):
        '''
        Rebuild the page so that it contains only the article.

        The 'article_text' div becomes the new page content. The title,
        the top picture with its caption paragraphs, and the abstract are
        placed in front of the text (in that order), the authors line is
        appended after it, and site-relative links are made absolute.

        Returns the modified soup; the soup is returned unchanged when the
        article div cannot be found.
        '''
        # Find article
        contents = soup.find('div', {'class': ['article_text']})
        if contents is None:
            self.log('postprocess_html: article div not found!')
            return soup
        contents.extract()

        # Position of the next header element. Tracking it keeps title,
        # picture and abstract at the top even when one of them is missing.
        headerPos = 0

        # Find title
        title = soup.find('h1')
        if title is not None:
            contents.insert(headerPos, title)
            headerPos += 1

        # Find article image
        newstop = soup.find('div', {'class': ['newstop']})
        if newstop is not None:
            img = newstop.find('img')
            if img is not None:
                imgDiv = Tag(soup, 'div')
                imgDiv['class'] = 'article_img'

                # Drop fixed dimensions so the stylesheet controls the size
                for attr in ('width', 'height'):
                    if img.get(attr) is not None:
                        del img[attr]

                # The description paragraphs follow the image's parent;
                # remember the starting point before the image is moved.
                element = img.parent.nextSibling

                img.extract()
                imgDiv.insert(0, img)

                while element is not None:
                    # Fetch the next sibling first: extract() detaches the
                    # element, and non-tag nodes (text, comments) must be
                    # stepped over rather than looped on forever.
                    nextElement = element.nextSibling
                    if isinstance(element, Tag) and 'p' == element.name:
                        element.extract()
                        element['class'] = 'article_img_desc'
                        imgDiv.insert(len(imgDiv.contents), element)
                    element = nextElement

                contents.insert(headerPos, imgDiv)
                headerPos += 1

        # Find article abstract
        abstract = soup.find('p', {'class': ['subhead']})
        if abstract is not None:
            abstract['class'] = 'article_desc'
            contents.insert(headerPos, abstract)
            headerPos += 1

        # Find article authors ('autors' is the class name used by the site)
        authorsDiv = soup.find('div', {'class': ['autors']})
        if authorsDiv is not None:
            authorsP = authorsDiv.find('p')
            if authorsP is not None:
                authorsP['class'] = 'article_authors'
                contents.insert(len(contents.contents), authorsP)

        # Fix urls that use relative path. Empty hrefs are skipped and
        # protocol-relative links ('//host/...') already name their host.
        for url in contents.findAll('a'):
            href = url.get('href')
            if href and href.startswith('/') and not href.startswith('//'):
                url['href'] = self.base_url + href

        body = soup.find('td', {'class': ['second_content']})
        if body is not None:
            body.replaceWith(contents)

        self.log('Result: ', soup.prettify())
        return soup