Read Ведомости on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
Ежедневная деловая газета
Language: ru
Requires Subscription: No, it's available as free ebook
Schedule Every morning
#!/usr/bin/env python
u'''
Ведомости
'''
from calibre.web.feeds.feedparser import parse
from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.news import BasicNewsRecipe
class VedomostiRecipe(BasicNewsRecipe):
title = u'Ведомости'
__author__ = 'Nikolai Kotchetkov'
publisher = 'vedomosti.ru'
category = 'press, Russia'
description = u'Ежедневная деловая газета'
oldest_article = 3
max_articles_per_feed = 100
masthead_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
cover_url = u'http://motorro.com/imgdir/logos/ved_logo_black2_cropped.gif'
#Add feed names if you want them to be sorted (feeds of this list appear first)
sortOrder = [u'_default', u'Первая полоса', u'Власть и деньги']
encoding = 'cp1251'
language = 'ru'
no_stylesheets = True
remove_javascript = True
recursions = 0
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : language
}
keep_only_tags = [dict(name='td', attrs={'class' : ['second_content']})]
remove_tags_after = [dict(name='div', attrs={'class' : 'article_text'})]
remove_tags = [dict(name='div', attrs={'class' : ['sep', 'choice', 'articleRightTbl']})]
feeds = [u'http://www.vedomosti.ru/newspaper/out/rss.xml']
#base URL for relative links
base_url = u'http://www.vedomosti.ru'
extra_css = 'h1 {font-size: 1.5em; margin: 0em 0em 0em 0em; text-align: center;}'\
'h2 {font-size: 1.0em; margin: 0em 0em 0em 0em;}'\
'h3 {font-size: 0.8em; margin: 0em 0em 0em 0em;}'\
'.article_date {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
'.article_authors {font-size: 0.5em; color: gray; font-family: monospace; text-align:right;}'\
'.article_img {width:100%; text-align: center; padding: 3px 3px 3px 3px;}'\
'.article_img_desc {width:100%; text-align: center; font-size: 0.5em; color: gray; font-family: monospace;}'\
'.article_desc {font-size: 1em; font-style:italic;}'
def parse_index(self):
try:
feedData = parse(self.feeds[0])
if not feedData:
raise NotImplementedError
self.log("parse_index: Feed loaded successfully.")
if feedData.feed.has_key('title'):
self.title = feedData.feed.title
self.log("parse_index: Title updated to: ", self.title)
if feedData.feed.has_key('description'):
self.description = feedData.feed.description
self.log("parse_index: Description updated to: ", self.description)
def get_virtual_feed_articles(feed):
if feeds.has_key(feed):
return feeds[feed][1]
self.log("Adding new feed: ", feed)
articles = []
feeds[feed] = (feed, articles)
return articles
feeds = {}
#Iterate feed items and distribute articles using tags
for item in feedData.entries:
link = item.get('link', '');
title = item.get('title', '');
if '' == link or '' == title:
continue
article = {'title':title, 'url':link, 'description':item.get('description', ''), 'date':item.get('date', ''), 'content':''};
if not item.has_key('tags'):
get_virtual_feed_articles('_default').append(article)
continue
for tag in item.tags:
addedToDefault = False
term = tag.get('term', '')
if '' == term:
if (not addedToDefault):
get_virtual_feed_articles('_default').append(article)
continue
get_virtual_feed_articles(term).append(article)
#Get feed list
#Select sorted feeds first of all
result = []
for feedName in self.sortOrder:
if (not feeds.has_key(feedName)): continue
result.append(feeds[feedName])
del feeds[feedName]
result = result + feeds.values()
return result
except Exception, err:
self.log(err)
raise NotImplementedError
def preprocess_html(self, soup):
return self.adeify_images(soup)
def postprocess_html(self, soup, first_fetch):
#self.log('Original: ', soup.prettify())
#Find article
contents = soup.find('div', {'class':['article_text']})
if not contents:
self.log('postprocess_html: article div not found!')
return soup
contents.extract()
#Find title
title = soup.find('h1')
if title:
contents.insert(0, title)
#Find article image
newstop = soup.find('div', {'class':['newstop']})
if newstop:
img = newstop.find('img')
if img:
imgDiv = Tag(soup, 'div')
imgDiv['class'] = 'article_img'
if img.has_key('width'):
del(img['width'])
if img.has_key('height'):
del(img['height'])
#find description
element = img.parent.nextSibling
img.extract()
imgDiv.insert(0, img)
while element:
if not isinstance(element, Tag):
continue
nextElement = element.nextSibling
if 'p' == element.name:
element.extract()
element['class'] = 'article_img_desc'
imgDiv.insert(len(imgDiv.contents), element)
element = nextElement
contents.insert(1, imgDiv)
#find article abstract
abstract = soup.find('p', {'class':['subhead']})
if abstract:
abstract['class'] = 'article_desc'
contents.insert(2, abstract)
#Find article authors
authorsDiv = soup.find('div', {'class':['autors']})
if authorsDiv:
authorsP = authorsDiv.find('p')
if authorsP:
authorsP['class'] = 'article_authors'
contents.insert(len(contents.contents), authorsP)
#Fix urls that use relative path
urls = contents.findAll('a');
if urls:
for url in urls:
if not url.has_key('href'):
continue
if '/' == url['href'][0]:
url['href'] = self.base_url + url['href']
body = soup.find('td', {'class':['second_content']})
if body:
body.replaceWith(contents)
self.log('Result: ', soup.prettify())
return soup