Read Dnevnik - mk on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Daily Macedonian newspaper

Language: mk

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  #!/usr/bin/env  python

# Recipe metadata: author, license and copyright notice.
__author__    = 'Darko Spasovski'
__license__   = 'GPL v3'
__copyright__ = '2011, Darko Spasovski <darko.spasovski at gmail.com>'
# The bare string below names the site this recipe targets. Because it follows
# the assignments above, Python does not treat it as the module docstring.
'''
dnevnik.com.mk
'''

import re
import datetime
from calibre.web.feeds.news import BasicNewsRecipe
from calibre import browser
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class Dnevnik(BasicNewsRecipe):
    """Recipe for the Macedonian daily newspaper Dnevnik (dnevnik.com.mk).

    The article list and the cover are both taken from the site's archive
    page for the current date. Sections are ordered the way they appear in
    the printed edition (see ``get_weight``).
    """

    INDEX                 = 'http://www.dnevnik.com.mk'
    __author__ = 'Darko Spasovski'
    title                 = 'Dnevnik - mk'
    description           = 'Daily Macedonian newspaper'
    masthead_url          = 'http://www.dnevnik.com.mk/images/re-logo.gif'
    language              = 'mk'
    publication_type      = 'newspaper'
    category              = 'news, Macedonia'
    max_articles_per_feed = 100
    remove_javascript     = True
    no_stylesheets        = True
    use_embedded_content  = False

    preprocess_regexps = [(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
    [
        ## Remove anything before the start of the article.
        (r'<body.*?<\?xml version=\"1.0\"\?><!--Article start-->', lambda match: '<body>'),

        ## Remove anything after the end of the article.
        (r'<!--Article end.*?</body>', lambda match : '</body>'),
        ]
    ]

    extra_css = """
                    body{font-family: Arial,Helvetica,sans-serif}
                    .WB_DNEVNIK_Naslov{FONT-WEIGHT: bold; FONT-SIZE: 18px; FONT-FAMILY: Arial, Verdana, Tahoma; TEXT-DECORATION: none}
                """

    conversion_options = {
                          'comment'  : description,
                          'tags'     : category,
                          'language' : language,
                          'linearize_tables' : True
                        }

    # Lower-cased section title -> position of that section in the printed
    # edition. Built once here instead of on every get_weight() call.
    _NATURAL_ORDER = { u'во фокусот': 1, u'актуелно': 2, u'економија': 3,
                       u'отворена': 4, u'свет': 5, u'интервју': 6, u'џубокс': 7,
                       u'репортажа': 8, u'наш туризам': 9, u'живот': 10,
                       u'автомобилизам': 11, u'спорт': 12, u'омнибус': 13 }

    # Weight given to sections that are not listed in _NATURAL_ORDER, so that
    # they sort after all known sections.
    _UNKNOWN_SECTION_WEIGHT = 999

    def _archive_url(self):
        """Return the URL of the archive page for today's date (dd.mm.YYYY)."""
        datum = datetime.datetime.today().strftime('%d.%m.%Y')
        return self.INDEX + '/default.asp?section=arhiva&arhDatum=' + datum

    def parse_index(self):
        """Build the list of feeds from today's archive page.

        Returns a list of ``(section_title, articles)`` tuples sorted by
        ``get_weight``. Each article is a dict with the keys ``title``,
        ``url``, ``description`` and ``date``. Sections whose title starts
        with 'online' are skipped, as are sections whose markup cannot be
        located and sections without any articles.
        """
        soup = self.index_to_soup(self._archive_url())
        feeds = []
        for section in soup.findAll('td', attrs={'class':'WB_DNEVNIK_ArhivaFormTitle'}):
            # The title is the string of the first child of the header cell;
            # it is None/missing when the cell is empty or not plain text.
            section_title = section.contents[0].string if section.contents else None
            if section_title is None:
                self.log.warning('Section header without a title - page layout may have been changed.')
                continue
            if section_title.lower().startswith('online'):
                # Skip online articles
                continue
            # The articles live in the table that follows the table
            # preceding the section header.
            previous_table = section.findPrevious(name='table')
            container_table = None
            if previous_table is not None:
                container_table = previous_table.findNextSibling(name='table')
            if container_table is None:
                self.log.warning('No container table found - page layout may have been changed.')
                continue
            articles = []
            for article in container_table.findAll('a', attrs={'class': 'WB_DNEVNIK_ArhivaFormText'}):
                href = article.get('href')
                if not href:
                    # An anchor without a target cannot be downloaded.
                    continue
                title = self.tag_to_string(article, use_alt=True).strip()
                articles.append({'title': title, 'url': self.INDEX + '/' + href, 'description':'', 'date':''})
            if articles:
                feeds.append((section_title, articles))
        return sorted(feeds, key=self.get_weight)

    def get_weight(self, section):
        """
        Returns 'weight' of a section.
        Used for sorting the sections based on their 'natural' order in the printed edition.

        ``section`` is a ``(title, articles)`` tuple as built by
        ``parse_index``. The title may be a parsed-HTML string object or a
        plain string. Titles that are not in the known list get a weight that
        places them at the bottom.
        """
        title = section[0]
        # Parsed-HTML strings expose their text via `.string`; plain strings
        # have no such attribute and are used as they are.
        title = getattr(title, 'string', title)
        key = title.strip().lower()
        return self._NATURAL_ORDER.get(key, self._UNKNOWN_SECTION_WEIGHT)

    def get_cover_url(self):
        """Return the URL of today's cover image, or '' if it cannot be found.

        Follows the 'more' link on today's archive page and takes the first
        image after the date block on the page it leads to.
        """
        soup = self.index_to_soup(self._archive_url())
        anchor = soup.find('a', attrs={'class': 'WB_DNEVNIK_MoreLink'})
        if anchor is None or not anchor.get('href'):
            return ''
        raw = browser().open_novisit(self.INDEX + '/' + anchor['href']).read()
        cover_soup = BeautifulSoup(raw)
        # Either lookup may come back empty if the page layout changes;
        # report "no cover" instead of failing on a missing element.
        date_block = cover_soup.find('div', attrs={'class':'WB_DNEVNIK_Datum2'})
        image = date_block.findNext('img') if date_block is not None else None
        src = image.get('src') if image is not None else None
        if not src:
            return ''
        return self.INDEX + '/' + src