Read The Wall Street Journal on your iPad or Kindle in no time. Click download to load the free ebook on your reader.
Check out all the available public recipes or write your own with these quick start guides. ReadBeam is built on calibre, so everything in the docs and the fora applies here as well.
Print issue - paid content. The Wall Street Journal. is an American English-language international daily newspaper. The Journal primarily covers American economic and international business topics, and financial news and issues.
Language: en
Requires Subscription: Yes, requires a The Wall Street Journal subscription
Schedule Every morning
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
from calibre.web.feeds.news import BasicNewsRecipe
import copy
# http://online.wsj.com/page/us_in_todays_paper.html
class WallStreetJournal(BasicNewsRecipe):
title = 'The Wall Street Journal'
__author__ = 'Kovid Goyal, Sujata Raman, and Joshua Oster-Morris'
description = 'News and current affairs'
needs_subscription = True
language = 'en'
max_articles_per_feed = 1000
timefmt = ' [%a, %b %d, %Y]'
no_stylesheets = True
extra_css = '''h1{color:#093D72 ; font-size:large ; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; }
h2{color:#474537; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.subhead{color:gray; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small; font-style:italic;}
.insettipUnit {color:#666666; font-family:Arial,Sans-serif;font-size:xx-small }
.targetCaption{ font-size:x-small; color:#333333; font-family:Arial,Helvetica,sans-serif}
.article{font-family :Arial,Helvetica,sans-serif; font-size:x-small}
.tagline {color:#333333; font-size:xx-small}
.dateStamp {color:#666666; font-family:Arial,Helvetica,sans-serif}
h3{color:blue ;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
.byline{color:blue;font-family:Arial,Helvetica,sans-serif; font-size:xx-small}
h6{color:#333333; font-family:Georgia,"Century Schoolbook","Times New Roman",Times,serif; font-size:small;font-style:italic; }
.paperLocation{color:#666666; font-size:xx-small}'''
remove_tags_before = dict(name='h1')
remove_tags = [
dict(id=["articleTabs_tab_article", "articleTabs_tab_comments", "articleTabs_tab_interactive","articleTabs_tab_video","articleTabs_tab_map","articleTabs_tab_slideshow","articleTabs_tab_quotes","articleTabs_tab_document"]),
{'class':['footer_columns','network','insetCol3wide','interactive','video','slideshow','map','insettip','insetClose','more_in', "insetContent", 'articleTools_bottom', 'aTools', "tooltip", "adSummary", "nav-inline"]},
dict(rel='shortcut icon'),
{'class':lambda x: x and 'sTools' in x},
]
remove_tags_after = [dict(id="article_story_body"), {'class':"article story"},]
def get_browser(self):
br = BasicNewsRecipe.get_browser()
if self.username is not None and self.password is not None:
br.open('http://commerce.wsj.com/auth/login')
br.select_form(nr=1)
br['user'] = self.username
br['password'] = self.password
res = br.submit()
raw = res.read()
if 'Welcome,' not in raw and '>Logout<' not in raw:
raise ValueError('Failed to log in to wsj.com, check your '
'username and password')
return br
def populate_article_metadata(self, article, soup, first):
if first and hasattr(self, 'add_toc_thumbnail'):
picdiv = soup.find('img')
if picdiv is not None:
self.add_toc_thumbnail(article,picdiv['src'])
def postprocess_html(self, soup, first):
for tag in soup.findAll(name=['table', 'tr', 'td']):
tag.name = 'div'
for tag in soup.findAll('div', dict(id=["articleThumbnail_1", "articleThumbnail_2", "articleThumbnail_3", "articleThumbnail_4", "articleThumbnail_5", "articleThumbnail_6", "articleThumbnail_7"])):
tag.extract()
return soup
def wsj_get_index(self):
return self.index_to_soup('http://online.wsj.com/itp')
def wsj_add_feed(self,feeds,title,url):
self.log('Found section:', title)
try:
if url.endswith('whatsnews'):
articles = self.wsj_find_wn_articles(url)
else:
articles = self.wsj_find_articles(url)
except:
articles = []
if articles:
feeds.append((title, articles))
return feeds
def abs_wsj_url(self, href):
if not href.startswith('http'):
href = 'http://online.wsj.com' + href
return href
def parse_index(self):
soup = self.wsj_get_index()
date = soup.find('span', attrs={'class':'date-date'})
if date is not None:
self.timefmt = ' [%s]'%self.tag_to_string(date)
cov = soup.find('div', attrs={'class':'itpSectionHeaderPdf'})
if cov is not None:
a = cov.find('a', href=True)
if a is not None:
self.cover_url = a['href']
feeds = []
div = soup.find('div', attrs={'class':'itpHeader'})
div = div.find('ul', attrs={'class':'tab'})
for a in div.findAll('a', href=lambda x: x and '/itp/' in x):
pageone = a['href'].endswith('pageone')
if pageone:
title = 'Front Section'
url = self.abs_wsj_url(a['href'])
feeds = self.wsj_add_feed(feeds,title,url)
title = "What's News"
url = url.replace('pageone','whatsnews')
feeds = self.wsj_add_feed(feeds,title,url)
else:
title = self.tag_to_string(a)
url = self.abs_wsj_url(a['href'])
feeds = self.wsj_add_feed(feeds,title,url)
return feeds
def wsj_find_wn_articles(self, url):
soup = self.index_to_soup(url)
articles = []
whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
if whats_news is not None:
for a in whats_news.findAll('a', href=lambda x: x and '/article/' in x):
container = a.findParent(['p'])
meta = a.find(attrs={'class':'meta_sectionName'})
if meta is not None:
meta.extract()
title = self.tag_to_string(a).strip()
url = a['href']
desc = ''
if container is not None:
desc = self.tag_to_string(container)
articles.append({'title':title, 'url':url,
'description':desc, 'date':''})
self.log('\tFound WN article:', title)
self.log('\t\t', desc)
return articles
def wsj_find_articles(self, url):
soup = self.index_to_soup(url)
whats_news = soup.find('div', attrs={'class':lambda x: x and 'whatsNews-simple' in x})
if whats_news is not None:
whats_news.extract()
articles = []
flavorarea = soup.find('div', attrs={'class':lambda x: x and 'ahed' in x})
if flavorarea is not None:
flavorstory = flavorarea.find('a', href=lambda x: x and x.startswith('/article'))
if flavorstory is not None:
flavorstory['class'] = 'mjLinkItem'
metapage = soup.find('span', attrs={'class':lambda x: x and 'meta_sectionName' in x})
if metapage is not None:
flavorstory.append( copy.copy(metapage) ) #metapage should always be A1 because that should be first on the page
for a in soup.findAll('a', attrs={'class':'mjLinkItem'}, href=True):
container = a.findParent(['li', 'div'])
meta = a.find(attrs={'class':'meta_sectionName'})
if meta is not None:
meta.extract()
meta = self.tag_to_string(meta).strip()
if meta:
title = self.tag_to_string(a).strip() + ' [%s]'%meta
else:
title = self.tag_to_string(a).strip()
url = self.abs_wsj_url(a['href'])
desc = ''
for p in container.findAll('p'):
desc = self.tag_to_string(p)
if not 'Subscriber Content' in desc:
break
articles.append({'title':title, 'url':url,
'description':desc, 'date':''})
self.log('\tFound article:', title)
self.log('\t\t', desc)
return articles
def cleanup(self):
self.browser.open('http://online.wsj.com/logout?url=http://online.wsj.com')