Read iHNed on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

Zprávy z iHNed.cz

Language: cs

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  import re, time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe

class IHNed(BasicNewsRecipe):
    """Calibre news recipe for the Czech news site iHNed.cz.

    ``parse_index`` scrapes the front page and a fixed list of section
    pages for article links; ``preprocess_html`` tidies each downloaded
    article page.
    """

    # Consulted while listing the section pages:
    #   True  = take every listed article
    #   False = take only articles dated today (the day the script runs)
    stahnout_vsechny = True

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
                   dict(style=['text-align: center;']),
                   dict(id=['r-bfull']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class':'d-nadtit'})
    remove_tags_after = dict(attrs={'class':'like'})

    conversion_options = {
        'linearize_tables': True,
    }

    # Site root that relative links are resolved against.
    _BASE_URL = "http://ihned.cz"

    # Name of the section built from the front page.
    _MAIN_SECTION = "Hlavní"

    # (index page URL, section name) for every section page to scrape.
    _SUBPAGES = (
        ("http://komentare.ihned.cz/", "Komentáře"),
        ("http://domaci.ihned.cz", "Domácí"),
        ("http://ekonomika.ihned.cz", "Ekonomika"),
        ("http://zahranicni.ihned.cz/", "Zahraničí"),
        ("http://finweb.ihned.cz/", "Finance"),
        ("http://digiweb.ihned.cz/", "DigiWeb"),
        ("http://kultura.ihned.cz/", "Kultura"),
        ("http://sport.ihned.cz/", "Sport"),
    )

    # Sort weights handed to sort_index_by. The main-section key must be
    # spelled exactly like _MAIN_SECTION, otherwise the weight is never
    # looked up for it.
    _SECTION_WEIGHTS = {
        'Hlavní': 1, 'Domácí': 2, 'Zahraničí': 3, 'Komentáře': 4,
        'Ekonomika': 5, 'Finance': 6, 'DigiWeb': 7, 'Kultura': 8,
        'Sport': 9,
    }

    def _absolute_url(self, href):
        """Return *href* prefixed with the site root when it is relative.

        Only hrefs starting with "/" or "index" are rewritten; anything
        else is returned unchanged.
        """
        if href.startswith("/"):
            return self._BASE_URL + href
        if href.startswith("index"):
            # No leading slash here, so one has to be inserted; plain
            # concatenation would yield "http://ihned.czindex...".
            return self._BASE_URL + "/" + href
        return href

    @staticmethod
    def _leading_text(tag, pattern):
        """Return group 1 of *pattern* matched against the markup of *tag*.

        Returns '' when *tag* is None or the pattern does not match.
        """
        if tag is None:
            return ''
        # No re.L flag: Python 3 rejects LOCALE on str patterns.
        match = re.match(pattern, str(tag))
        if match is None:
            return ''
        # The greedy [^<]* group also swallows trailing whitespace.
        return match.group(1).strip()

    def _article(self, title, href, description):
        """Build one article dict in the shape parse_index returns."""
        return dict(title=title, url=self._absolute_url(href),
                    date=strftime('%d. %m.'),
                    description=description,
                    content='')

    def _lead_article(self, soup):
        """Return the article dict for the 'otv' lead story, or None.

        None is returned when the page has no 'otv' element or that
        element contains no link.
        """
        otvirak = soup.find(True, attrs={'class':['otv']})
        if otvirak is None:
            return None
        a = otvirak.find('a', href=True)
        if a is None:
            return None
        title = self.tag_to_string(a, use_alt=True).strip()
        txt = otvirak.find(True, attrs={'class':['txt']})
        description = self._leading_text(
            txt, r'<div class="txt">\s*([^<]*)\s*<a')
        return self._article(title, a['href'], description)

    @staticmethod
    def _is_dated(nfo, when):
        """Tell whether the 'day.month.' date inside *nfo* equals *when*.

        *when* is a time.struct_time. Returns False when no date can be
        found in the markup of *nfo*.
        """
        # '+' instead of '*' so that bare dots cannot match and feed an
        # empty string to int().
        match = re.search(r'([0-9]+)\.([0-9]+)\.', str(nfo))
        if match is None:
            return False
        return (int(match.group(1)) == when.tm_mday
                and int(match.group(2)) == when.tm_mon)

    def _section_articles(self, url, seen_titles):
        """Scrape one section index page and return its article dicts.

        Articles whose title is in *seen_titles* are skipped.
        """
        soup = self.index_to_soup(url)
        found = []

        lead = self._lead_article(soup)
        if lead is not None and lead['title'] not in seen_titles:
            found.append(lead)

        otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
        if otv234 is None:
            return found

        today = time.localtime()
        for ow in otv234.findAll(True, attrs={'class':['ow']}):
            a = ow.find('a', href=True)
            nfo = ow.find(True, attrs={'class':['nfo']})
            # Entries without a link cannot be fetched; entries without
            # an 'nfo' element are skipped.
            if a is None or nfo is None:
                continue
            if not (self.stahnout_vsechny or self._is_dated(nfo, today)):
                continue
            title = self.tag_to_string(a, use_alt=True).strip()
            if title in seen_titles:
                continue
            description = ''
            prx = ow.find(True, attrs={'class':['prx']})
            # .string may be None; str() would turn that into 'None'.
            if prx is not None and prx.string:
                description = str(prx.string)
            found.append(self._article(title, a['href'], description))
        return found

    def preprocess_html(self, soup):
        """Clean up one downloaded article page and return the soup.

        Links inside <h1> headings are replaced by their text, and hrefs
        starting with "/" or "index" are made absolute.
        """
        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a is None:
                continue
            string = a.string
            if string:
                # Replace this heading's own link, not the first <a> of
                # the whole document.
                a.replaceWith(string)
        for a in soup.findAll('a', href=True):
            a['href'] = self._absolute_url(a['href'])
        return soup

    def parse_index(self):
        """Collect the article list for every section.

        Returns a list of ``(section name, [article dict, ...])`` tuples,
        ordered by ``_SECTION_WEIGHTS``. Each article dict has the keys
        ``title``, ``url``, ``date``, ``description`` and ``content``.
        """
        articles = {}     # section name -> list of article dicts
        sections = []     # section names in discovery order
        seen_titles = set()   # titles already taken from the front page

        main = self._MAIN_SECTION
        articles[main] = []
        sections.append(main)

        soup = self.index_to_soup('http://ihned.cz/')

        lead = self._lead_article(soup)
        if lead is not None:
            articles[main].append(lead)
            seen_titles.add(lead['title'])

        otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
        if otvirak2345 is not None:
            for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
                a = otv2.find('a', attrs={'class':['tit2']}, href=True)
                if a is None:
                    continue
                title = self.tag_to_string(a, use_alt=True).strip()
                description = self._leading_text(
                    otv2.find('span'), r'<span>\s*([^<]*)\s*<a')
                articles[main].append(
                    self._article(title, a['href'], description))
                seen_titles.add(title)

        for url, name in self._SUBPAGES:
            # Register the section before fetching it.
            articles[name] = []
            sections.append(name)
            articles[name] = self._section_articles(url, seen_titles)

        # Order the sections by their weights.
        sections = self.sort_index_by(sections, self._SECTION_WEIGHTS)

        # Return only the sections that have an entry in `articles`.
        return [(name, articles[name]) for name in sections
                if name in articles]