Read Glenn Beck on your iPad or Kindle in no time. Click download to load the free ebook on your reader.

Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.

Download for free »

The fusion of entertainment and enlightenment

Language: en

Requires Subscription: No, it's available as a free ebook

Schedule: Every morning

			  from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, Comment

class GlennBeckRecipe(BasicNewsRecipe):
    """Recipe for the Glenn Beck article feed.

    The article pages are too malformed to pick out the content container
    directly, so ``preprocess_html`` rebuilds every article inside a fresh,
    empty document from the pieces it can still locate.
    """

    # Recipe metadata.
    __license__ = 'GPL v3'
    __author__ = 'kwetal'
    language = 'en'
    version = 1

    title = u'Glenn Beck'
    publisher = u'Premiere Radio Networks'
    category = u'News, Opinion'
    description = u'The fusion of entertainment and enlightenment'

    # Feed limits (BasicNewsRecipe settings).
    oldest_article = 7
    max_articles_per_feed = 100

    # Download/clean-up options (BasicNewsRecipe settings).
    no_stylesheets = True
    remove_javascript = True
    use_embedded_content = False

    # (feed title, feed URL) pairs.
    feeds = [(u'Glenn Beck', u'http://feeds.feedburner.com/GlennBeckArticles')]

    def preprocess_html(self, soup):
        """Rebuild the article in a fresh document and return that document.

        ``soup`` is modified in place: unwanted nodes are extracted from it
        and the content nodes are moved out of it into the new document.

        :param soup: the parsed article page.
        :return: a new soup holding the headline (when found) and the content.
        """
        # Their html is horribly broken; if we search for the div that has the
        # content BeautifulSoup returns the div with only the headline and no
        # content. This is due to illegal nesting of tags. So we do it the
        # hard way.

        # We can find this one, and we don't want it.
        extra_info = soup.find('div', attrs={'id': 'extraInfo'})
        if extra_info is not None:
            extra_info.extract()

        # Don't want these either.
        for iframe in soup.findAll('iframe'):
            iframe.extract()

        # Get empty document.
        freshSoup = self.getFreshSoup()

        # This is the broken div; but we can find the headline.
        newsDiv = soup.find('div', attrs={'class': 'news-detail'})
        if newsDiv is not None and newsDiv.h1 is not None:
            freshSoup.body.append(newsDiv.h1)

        # The content is wrapped in <p></p> tags, most of the time anyway.
        counter = 0
        for p in soup.findAll('p'):
            css_class = p.get('class')
            # Some BeautifulSoup versions return the class attribute as a list
            # of names instead of a single string; join it so the exact-match
            # test below works either way.
            if isinstance(css_class, (list, tuple)):
                css_class = ' '.join(css_class)
            if css_class == 'smalltextwhite':
                # But we don't want this one.
                continue

            freshSoup.body.append(p)
            counter += 1

        # In some articles the content is not wrapped in <p></p> tags. In that
        # case the counter is low. 2 is the magic number that seems to work.
        if counter <= 2:
            # So they are playing hard-to-get: first throw out all comments.
            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()

            # Find all unwrapped strings.
            for txt in soup.findAll(text=True):
                raw = txt.strip()
                # Only non-empty text sitting directly under <body> is content;
                # a lone '&nbsp;' entity is just spacing.
                if txt.parent.name == 'body' and raw and raw != '&nbsp;':
                    para = Tag(freshSoup, 'p')
                    para.append(raw)
                    freshSoup.body.append(para)
                    counter += 1

            # Now if the counter is still 0 or 1 they did something completely
            # different and we still have an empty article. In a last attempt,
            # add the whole content div, just in case. Skip this when the div
            # was never found: there is nothing to append then.
            if counter < 2 and newsDiv is not None:
                freshSoup.body.append(newsDiv)

        return freshSoup

    def getFreshSoup(self, title=None):
        """Return an empty HTML document.

        :param title: optional value placed inside ``<title>``; a falsy value
            leaves the title element empty.
        :return: a soup with an empty ``<body>``.
        """
        title_text = str(title) if title else ''
        return BeautifulSoup('<html><head><title>' + title_text + '</title></head><body></body></html>')