Read Dnevnik - mk on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
Daily Macedonian newspaper
Language: mk
Requires Subscription: No, it's available as a free ebook
Schedule: Every morning
#!/usr/bin/env python
# Recipe metadata: author, license and copyright of this file.
__author__ = 'Darko Spasovski'
__license__ = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
# Bare string naming the site this recipe targets. It follows the
# assignments above, so it is a no-op expression, not the module docstring.
'''
dnevnik.com.mk
'''
import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Dnevnik(BasicNewsRecipe):
    """Calibre news recipe for the Macedonian daily newspaper Dnevnik.

    The article list is read from the site's archive page for the current
    date. Sections whose title starts with "online" are skipped and the
    remaining sections are sorted with :meth:`get_weight`.
    """

    INDEX = 'http://www.dnevnik.com.mk'
    __author__ = 'Darko Spasovski'
    title = 'Dnevnik - mk'
    description = 'Daily Macedonian newspaper'
    masthead_url = 'http://www.dnevnik.com.mk/images/re-logo.gif'
    language = 'mk'
    publication_type = 'newspaper'
    category = 'news, Macedonia'
    max_articles_per_feed = 100
    remove_javascript = True
    no_stylesheets = True
    use_embedded_content = False

    # The flags are spelled out inside the comprehension on purpose: under
    # Python 3 a comprehension in a class body cannot see class-level names.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in [
            ## Remove anything before the start of the article.
            (r'<body.*?<\?xml version=\"1.0\"\?><!--Article start-->', lambda match: '<body>'),
            ## Remove anything after the end of the article.
            (r'<!--Article end.*?</body>', lambda match: '</body>'),
        ]
    ]

    # The CSS lines are kept flush-left so the string value is unchanged.
    extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
.WB_DNEVNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
"""

    conversion_options = {
        'comment': description,
        'tags': category,
        'language': language,
        'linearize_tables': True,
    }

    # Lower-cased section title -> position in the printed edition.
    # Built once here instead of on every sort-key call.
    _SECTION_ORDER = {
        u'во фокусот': 1, u'актуелно': 2, u'економија': 3,
        u'отворена': 4, u'свет': 5, u'интервју': 6, u'џубокс': 7,
        u'репортажа': 8, u'наш туризам': 9, u'живот': 10,
        u'автомобилизам': 11, u'спорт': 12, u'омнибус': 13,
    }
    # Weight for section names that are not in _SECTION_ORDER (sorted last).
    _UNKNOWN_SECTION_WEIGHT = 999

    def _archive_url(self):
        """Return the URL of the archive page for today's date."""
        datum = datetime.datetime.today().strftime('%d.%m.%Y')
        return self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum

    def parse_index(self):
        """Build the list of feeds from today's archive page.

        Returns:
            A list of ``(section_title, articles)`` tuples sorted by
            :meth:`get_weight`. Each article is a dict with the keys
            ``title``, ``url``, ``description`` and ``date``. Sections
            without any article are omitted.
        """
        soup = self.index_to_soup(self._archive_url())
        feeds = []
        for section in soup.findAll('td', attrs={'class': 'WB_DNEVNIK_ArhivaFormTitle'}):
            # A header cell without a single text child has no usable title.
            section_title = section.contents[0].string if section.contents else None
            if section_title is None:
                continue
            if section_title.lower().startswith('online'):
                # Skip online articles
                continue

            # The articles live in the table that follows the table
            # preceding the section header cell.
            header_table = section.findPrevious(name='table')
            container_table = None
            if header_table is not None:
                container_table = header_table.findNextSibling(name='table')
            if container_table is None:
                self.log.warn('No container table found - page layout may have been changed.')
                continue

            articles = []
            for link in container_table.findAll('a', attrs={'class': 'WB_DNEVNIK_ArhivaFormText'}):
                href = link.get('href')
                if not href:
                    # An anchor without a target cannot be downloaded.
                    continue
                articles.append({
                    'title': self.tag_to_string(link, use_alt=True).strip(),
                    'url': self.INDEX + '/' + href,
                    'description': '',
                    'date': '',
                })
            if articles:
                feeds.append((section_title, articles))
        return sorted(feeds, key=self.get_weight)

    def get_weight(self, section):
        """
        Returns 'weight' of a section.
        Used for sorting the sections based on their 'natural' order in the printed edition.

        ``section`` is a ``(title, articles)`` tuple as built by
        :meth:`parse_index`. Titles are matched case-insensitively; section
        names not on the list get the weight 999 and so go to the bottom.
        """
        section_title = section[0]
        # Accept parsed-HTML nodes (which expose their text as ``.string``)
        # as well as plain strings.
        section_title = getattr(section_title, 'string', section_title)
        return self._SECTION_ORDER.get(section_title.lower(), self._UNKNOWN_SECTION_WEIGHT)

    def get_cover_url(self):
        """Return the URL of today's cover image.

        Follows the ``WB_DNEVNIK_MoreLink`` anchor on the archive page and
        takes the first image after the ``WB_DNEVNIK_Datum2`` div on the
        linked page.

        Returns:
            The cover image URL, or ``''`` when the link or the image
            cannot be found.
        """
        soup = self.index_to_soup(self._archive_url())
        anchor = soup.find('a', attrs={'class': 'WB_DNEVNIK_MoreLink'})
        if anchor is None or not anchor.get('href'):
            return ''

        raw = browser().open_novisit(self.INDEX + '/' + anchor['href']).read()
        cover_soup = BeautifulSoup(raw)
        date_div = cover_soup.find('div', attrs={'class': 'WB_DNEVNIK_Datum2'})
        image = None if date_div is None else date_div.findNext('img')
        if image is None or not image.get('src'):
            # Degrade to "no cover" instead of failing on a changed layout.
            self.log.warn('No cover image found - page layout may have been changed.')
            return ''
        return self.INDEX + '/' + image['src']