Read Newsweek Polska on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
Weekly magazine
Language: pl
Requires Subscription: No, it's available as a free ebook
Schedule: Every morning
# -*- coding: utf-8 -*-
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
import datetime
class Newsweek(BasicNewsRecipe):
    """Calibre recipe for the newest fully unlocked issue of Newsweek Polska.

    ``parse_index`` walks the online archive (newest issue first, falling
    back to earlier years) until it finds an issue whose table of contents
    is not marked as locked, then builds the feed list from that issue.
    Articles are spread over several web pages, so they are fetched through
    ``get_obfuscated_article``, which merges all pages into one local file.
    """

    # Prefix shared by every issue URL; the remainder identifies the issue.
    _ISSUES_URL = 'http://www.newsweek.pl/wydania/'

    # Issue identifier (issue URL without the prefix above); set by
    # find_last_full_issue().
    EDITION = '0'
    # Issue date text scraped from the issue page; set by parse_index().
    DATE = None
    # Archive year being searched; decremented by find_last_full_issue()
    # whenever the current archive page has no full issue.
    YEAR = datetime.datetime.now().year

    title = u'Newsweek Polska'
    __author__ = 'matek09'
    description = 'Weekly magazine'
    encoding = 'utf-8'
    language = 'pl'
    remove_javascript = True
    # Temporary files created by get_obfuscated_article(). Class-level list,
    # so it is shared by all instances of the recipe.
    temp_files = []
    articles_are_obfuscated = True

    def _fetch_main_section(self, br, url):
        """Open ``url`` with browser ``br`` and return its 'mainSection' tag."""
        br.open(url)
        source = br.response().read()
        page = self.index_to_soup(source)
        return page.find(id='mainSection')

    def get_obfuscated_article(self, url):
        """Merge all pages of a multi-page article into one temporary file.

        The first page contributes the headline, the author block and the
        article body; each following page (reached through the 'next' link)
        contributes only its article body, with the 'articleAside' element
        removed.

        :param url: URL of the first page of the article.
        :return: path of the temporary HTML file holding the merged article.
        """
        br = self.get_browser()
        main_section = self._fetch_main_section(br, url)
        title = main_section.find('h1')
        info = main_section.find('ul', attrs={'class': 'articleInfo'})
        authors = info.find('li').find('h4')
        article = main_section.find('div', attrs={'id': 'article'})
        parts = [unicode(title), unicode(authors), unicode(article)]

        next_link = main_section.find('li', attrs={'class': 'next'})
        while next_link is not None:
            url = next_link.find('a')['href']
            main_section = self._fetch_main_section(br, url)
            article = main_section.find('div', attrs={'id': 'article'})
            aside = article.find(id='articleAside')
            if aside is not None:
                aside.extract()
            parts.append(unicode(article))
            next_link = main_section.find('li', attrs={'class': 'next'})

        html = u''.join(parts)
        temp_file = PersistentTemporaryFile('_temparse.html')
        self.temp_files.append(temp_file)
        try:
            # Encode explicitly instead of relying on the interpreter's
            # default encoding; UTF-8 matches the recipe's ``encoding``.
            temp_file.write(html.encode('utf-8'))
        finally:
            temp_file.close()
        return temp_file.name

    def is_full(self, issue_soup):
        """Tell whether an issue's table of contents is fully unlocked.

        Every page of the table of contents is inspected, following the
        'next' links. A page with more than one element of class 'locked'
        makes the whole issue count as not full.

        :param issue_soup: parsed first page of the issue.
        :return: False as soon as such a page is found, True once the last
            page has been checked.
        """
        while True:
            main_section = issue_soup.find(id='mainSection')
            locked = main_section.findAll(attrs={'class': 'locked'})
            if len(locked) > 1:
                return False
            next_link = main_section.find('li', attrs={'class': 'next'})
            if next_link is None:
                return True
            issue_soup = self.index_to_soup(next_link.find('a')['href'])

    def find_last_full_issue(self, archive_url):
        """Store the newest full issue's identifier in ``self.EDITION``.

        Issues listed in the archive page's 'paper_issue_select' drop-down
        are tried in document order. When none of them is full,
        ``self.YEAR`` is decremented and the archive page of that year
        (``<base archive url>,<year>``) is searched next.

        :param archive_url: URL of the archive page to start from.
        :raises ValueError: if an archive page has no issue selector, which
            also ends the search when no earlier archive exists.
        """
        # The year suffix must always be appended to the bare archive URL;
        # appending it to the previous URL would pile up suffixes such as
        # 'archiwum,2009,2008' after more than one fallback.
        base_url = archive_url.split(',')[0]
        while True:
            archive_soup = self.index_to_soup(archive_url)
            select = archive_soup.find(
                'select', attrs={'id': 'paper_issue_select'})
            if select is None:
                raise ValueError(
                    'No issue selector found on archive page ' + archive_url)
            options = select.findAll(
                lambda tag: tag.name == 'option' and tag.has_key('value'))
            for option in options:
                self.EDITION = option['value'].replace(self._ISSUES_URL, '')
                issue_soup = self.index_to_soup(
                    self._ISSUES_URL + self.EDITION)
                if self.is_full(issue_soup):
                    return
            self.YEAR = self.YEAR - 1
            archive_url = base_url + ',' + str(self.YEAR)

    def parse_index(self):
        """Build the feed list for the newest full issue.

        Also sets ``self.DATE`` from the issue page and, when a cover image
        is present, ``self.cover_url``.

        :return: list of ``(section title, [article dict, ...])`` tuples,
            with sections in the order they first appear in the issue.
        """
        archive_url = 'http://www.newsweek.pl/wydania/archiwum'
        self.find_last_full_issue(archive_url)
        soup = self.index_to_soup(self._ISSUES_URL + self.EDITION)
        self.DATE = self.tag_to_string(
            soup.find('span', attrs={'class': 'data'}))
        main_section = soup.find(id='mainSection')
        img = main_section.find(
            lambda tag: tag.name == 'img'
            and tag.has_key('alt') and tag.has_key('title'))
        # A missing cover image should not abort the whole download.
        if img is not None:
            self.cover_url = img['src']

        articles = {}
        sections = []  # section titles in first-seen order
        while True:
            news_list = main_section.find('ul', attrs={'class': 'newsList'})
            for h2 in news_list.findAll('h2'):
                article = self.create_article(h2)
                category_div = h2.findNext(
                    'div', attrs={'class': 'kategorie'})
                section = self.tag_to_string(category_div)
                if section not in articles:
                    articles[section] = []
                    sections.append(section)
                articles[section].append(article)

            next_link = main_section.find('li', attrs={'class': 'next'})
            if next_link is None:
                break
            soup = self.index_to_soup(next_link.find('a')['href'])
            main_section = soup.find(id='mainSection')

        return [(section, articles[section]) for section in sections]

    def create_article(self, h2):
        """Build an article dict from a headline tag of the issue listing.

        :param h2: ``<h2>`` tag that contains the link to the article.
        :return: dict with 'title', 'url', 'date' (``self.DATE``) and
            'description' (text of the next ``<p>`` after the headline, or
            an empty string when there is none).
        """
        a = h2.find('a')
        desc = h2.findNext('p')
        return {
            'title': self.tag_to_string(a),
            'url': a['href'],
            'date': self.DATE,
            'description':
                self.tag_to_string(desc) if desc is not None else '',
        }