Read Newsweek Polska on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes, or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the calibre docs and forums applies here as well.

Download for free »

Weekly magazine

Language: pl

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  # -*- coding: utf-8 -*-
#!/usr/bin/env  python

__license__   = 'GPL v3'
__copyright__ = '2010, matek09, matek09@gmail.com'

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ptempfile import PersistentTemporaryFile
import datetime


class Newsweek(BasicNewsRecipe):
	"""Recipe that downloads the newest fully readable issue of Newsweek Polska.

	The issue archive is searched, newest year first, for an issue that
	``is_full`` accepts.  ``parse_index`` then walks the paginated issue
	listing and groups the articles by category, and
	``get_obfuscated_article`` stitches every multi-page article into one
	temporary HTML file.
	"""

	# URL suffix identifying the selected issue; set by find_last_full_issue().
	EDITION = '0'
	# Issue date text scraped from the issue page; set by parse_index().
	DATE = None
	# Archive year currently being searched; lowered by find_last_full_issue().
	YEAR = datetime.datetime.now().year

	# Prefix shared by all issue URLs; the edition identifier is appended to it.
	ISSUE_URL_PREFIX = 'http://www.newsweek.pl/wydania/'
	# Number of earlier archive years to try before giving up.
	MAX_ARCHIVE_YEARS_BACK = 5

	title = u'Newsweek Polska'
	__author__ = 'matek09'
	description = 'Weekly magazine'
	encoding = 'utf-8'
	language = 'pl'
	remove_javascript = True

	# Holds the temporary files created by get_obfuscated_article().
	# NOTE(review): class-level list, so it is shared by all instances.
	temp_files = []
	articles_are_obfuscated = True

	def _next_page_url(self, main_section):
		"""Return the URL of the page after ``main_section``.

		Returns None when ``main_section`` is None or carries no usable
		``li.next`` link, i.e. when this is the last page.
		"""
		if main_section is None:
			return None
		next_item = main_section.find('li', attrs={'class' : 'next'})
		if next_item is None:
			return None
		link = next_item.find('a')
		if link is None:
			return None
		# An empty href is treated like a missing one.
		return link.get('href') or None

	def _find_authors(self, main_section):
		"""Return the authors ``h4`` tag of an article page, or None if absent."""
		info = main_section.find('ul', attrs={'class' : 'articleInfo'})
		if info is None:
			return None
		first_item = info.find('li')
		if first_item is None:
			return None
		return first_item.find('h4')

	def get_obfuscated_article(self, url):
		"""Download a (possibly multi-page) article into one local HTML file.

		The first page contributes its title, authors and article body.
		Each following page contributes its article body with the
		``articleAside`` element removed.  Missing elements are skipped
		instead of being rendered as the text ``None``.

		:param url: URL of the first page of the article.
		:return: Path of the temporary file holding the combined HTML.
		:raises ValueError: if the first page has no ``mainSection`` element.
		"""
		br = self.get_browser()
		fragments = []
		is_first_page = True

		while url:
			br.open(url)
			page = self.index_to_soup(br.response().read())
			main_section = page.find(id='mainSection')
			if main_section is None:
				if is_first_page:
					raise ValueError('No mainSection element found at ' + url)
				# Broken continuation page: keep what was collected so far.
				break

			article = main_section.find('div', attrs={'id' : 'article'})
			if is_first_page:
				fragments.append(main_section.find('h1'))
				fragments.append(self._find_authors(main_section))
				is_first_page = False
			elif article is not None:
				# The aside is stripped on continuation pages only
				# (presumably so it is not repeated -- TODO confirm).
				aside = article.find(id='articleAside')
				if aside is not None:
					aside.extract()
			fragments.append(article)
			url = self._next_page_url(main_section)

		html = u''.join(unicode(part) for part in fragments if part is not None)

		temp_file = PersistentTemporaryFile('_temparse.html')
		self.temp_files.append(temp_file)
		try:
			# Encode explicitly (same charset as ``encoding`` above) rather
			# than relying on the interpreter's implicit default encoding.
			temp_file.write(html.encode('utf-8'))
		finally:
			temp_file.close()
		return temp_file.name

	def is_full(self, issue_soup):
		"""Tell whether every page of an issue listing is readable.

		Follows the "next" links starting from ``issue_soup``.  The issue
		is rejected as soon as a page holds more than one element of class
		``locked``, or lacks the ``mainSection`` element altogether.

		:param issue_soup: Parsed first page of the issue listing.
		:return: True if no page was rejected, False otherwise.
		"""
		while True:
			main_section = issue_soup.find(id='mainSection')
			if main_section is None:
				# Cannot verify the page, so do not treat the issue as full.
				return False
			if len(main_section.findAll(attrs={'class' : 'locked'})) > 1:
				return False
			next_url = self._next_page_url(main_section)
			if next_url is None:
				return True
			issue_soup = self.index_to_soup(next_url)

	def find_last_full_issue(self, archive_url):
		"""Store the newest full issue's identifier in ``self.EDITION``.

		The archive page at ``archive_url`` is tried first.  If none of its
		issues is full, the archives of earlier years are tried at
		``archive_url + ',' + year``.  The year suffix is always appended to
		the base URL, never to an already suffixed one.  ``self.YEAR`` is
		lowered for every year stepped back.

		:param archive_url: Base URL of the issue archive.
		:raises ValueError: if no full issue is found within
			``MAX_ARCHIVE_YEARS_BACK`` earlier years.
		"""
		year_url = archive_url
		years_back = 0
		while True:
			archive_soup = self.index_to_soup(year_url)
			select = archive_soup.find('select', attrs={'id' : 'paper_issue_select'})
			if select is None:
				options = []
			else:
				options = select.findAll(
					lambda tag: tag.name == 'option' and tag.get('value') is not None)
			for option in options:
				self.EDITION = option['value'].replace(self.ISSUE_URL_PREFIX, '')
				issue_soup = self.index_to_soup(self.ISSUE_URL_PREFIX + self.EDITION)
				if self.is_full(issue_soup):
					return

			if years_back >= self.MAX_ARCHIVE_YEARS_BACK:
				raise ValueError('No full issue found in the archive at ' + archive_url)
			years_back += 1
			self.YEAR = self.YEAR - 1
			year_url = archive_url + ',' + str(self.YEAR)

	def parse_index(self):
		"""Build the feed list for the newest full issue.

		Also sets ``self.DATE`` and, when a cover image is found,
		``self.cover_url``.

		:return: List of ``(section_title, articles)`` tuples, with sections
			in order of first appearance; each article is a dict as built
			by ``create_article``.
		"""
		archive_url = 'http://www.newsweek.pl/wydania/archiwum'
		self.find_last_full_issue(archive_url)
		soup = self.index_to_soup(self.ISSUE_URL_PREFIX + self.EDITION)
		self.DATE = self.tag_to_string(soup.find('span', attrs={'class' : 'data'}))
		main_section = soup.find(id='mainSection')

		if main_section is not None:
			img = main_section.find(
				lambda tag: tag.name == 'img'
				and tag.get('alt') is not None
				and tag.get('title') is not None)
			if img is not None:
				self.cover_url = img['src']

		articles = {}
		sections = []  # remembers the order in which sections first appear
		while main_section is not None:
			news_list = main_section.find('ul', attrs={'class' : 'newsList'})
			headings = [] if news_list is None else news_list.findAll('h2')
			for h2 in headings:
				if h2.find('a') is None:
					# A heading without a link cannot become an article.
					continue
				article = self.create_article(h2)
				category_div = h2.findNext('div', attrs={'class' : 'kategorie'})
				section = self.tag_to_string(category_div)
				if section not in articles:
					articles[section] = []
					sections.append(section)
				articles[section].append(article)

			next_url = self._next_page_url(main_section)
			if next_url is None:
				break
			main_section = self.index_to_soup(next_url).find(id='mainSection')

		return [(section, articles[section]) for section in sections]

	def create_article(self, h2):
		"""Build the article dict for one ``h2`` heading of the issue listing.

		:param h2: Heading tag containing the link to the article.
		:return: Dict with ``title``, ``url``, ``date`` and ``description``;
			the description is empty when no ``p`` tag follows the heading.
		"""
		a = h2.find('a')
		desc = h2.findNext('p')
		return {
			'title' : self.tag_to_string(a),
			'url' : a['href'],
			'date' : self.DATE,
			'description' : self.tag_to_string(desc) if desc is not None else '',
		}