Web Scraping: Finding Necessary Contents from a Medium.com Blog Post
Sometimes we need to scrape the contents of an article from a Medium blog to create a dataset or analyze data.
In this post, I will provide the necessary code to scrape different contents of a blog post from Medium.
Necessary Modules
Let’s first install the following modules. BeautifulSoup and requests are our primary modules, and both are popular for this type of task. We also need json and re for some minor tuning tasks.
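If the third-party packages are not installed yet, a quick pip command takes care of them (json and re ship with the Python standard library):
pip install beautifulsoup4 requests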
from bs4 import BeautifulSoup
import requests
import json
import re
Scraping Specific Content
Here is a class that includes methods for collecting the following information:
- post title
- author name
- author link
- publish date
- post tags
- read time
- no. of claps
- no. of voters
- no. of post responses
class PostDetails:
    def __init__(self, soup, link=None):
        self.page_soup = soup
        self.link = link

    # print a structured error trace so a failing post is easy to identify
    def _log_error(self, method, e):
        error_trace = {
            "link": self.link,
            "method": method,
            "message": str(e),
        }
        print(json.dumps(error_trace, indent=4))

    # returns the basic JSON response
    # collection target -> title, author info, publication dates, tags, post links, etc.
    def json_response_basic(self):
        try:
            for my_tag in self.page_soup.find_all('script', {
                    'type': "application/ld+json"}):
                return my_tag.text
        except Exception as e:
            self._log_error("json_response_basic", e)
        return ""

    # returns a larger JSON response with more detailed information
    # collection target -> read time, upvotes, responses, etc.
    def json_response_whole(self):
        try:
            # [source: https://stackoverflow.com/questions/13323976/how-to-extract-a-json-object-that-was-defined-in-a-html-page-javascript-block-us
            # script = re.compile('window.__APOLLO_STATE__ = ({.*})', re.DOTALL)
            # json_text = script.search(str(self.page_soup))
            # the following code of mine looks simpler :D
            for tag in self.page_soup.find_all('script'):
                if 'window.__APOLLO_STATE__' in tag.text:
                    # strip the 26-character prefix "window.__APOLLO_STATE__ = "
                    return tag.text[26:]
        except Exception as e:
            self._log_error("json_response_whole", e)
        return ""

    # find the first key that we need in the larger JSON response
    # we're using a regex because the post id will be different for each post,
    # but we know that the post key always starts with "Post:"
    def find_first_key(self, json_data):
        find_key_string = [re.compile("^Post:").match]
        required_key = [k for k, v in json_data.items()
                        if any(item(k) for item in find_key_string)]
        return required_key[0]

    # returns the post title using the h1 tag
    # it could be collected from the basic JSON response as well
    def get_title(self):
        try:
            for my_tag in self.page_soup.find_all('h1'):
                return my_tag.text
        except Exception as e:
            self._log_error("get_title", e)
        return ""

    # returns two values: author name and profile url
    def get_author_name(self, json_data):
        try:
            author_name = json_data['author']['name']
            author_url = json_data['author']['url']
            return author_name, author_url
        except Exception as e:
            self._log_error("get_author_name", e)
        return "", ""

    # returns 3 dates for a post: creation, publication, & last modification date
    def get_date(self, json_data):
        try:
            creation_date = json_data['dateCreated']
            published_date = json_data['datePublished']
            last_modification_date = json_data['dateModified']
            return creation_date, published_date, last_modification_date
        except Exception as e:
            self._log_error("get_date", e)
        return "", "", ""

    def get_tags(self, key, json_data):
        try:
            tags = []
            for item in json_data[key]['tags']:
                # each tag reference looks like "Tag:bitcoin"; drop the "Tag:" prefix
                tags.append(item["__ref"][4:])
            return tags
        except Exception as e:
            self._log_error("get_tags", e)
        return []

    def get_read(self, key, json_data):
        try:
            return json_data[key]['readingTime']
        except Exception as e:
            self._log_error("get_read", e)
        return ""

    def get_upvote(self, key, json_data):
        try:
            clap_count = json_data[key]['clapCount']
            voter_count = json_data[key]['voterCount']
            return clap_count, voter_count
        except Exception as e:
            self._log_error("get_upvote", e)
        return "", ""

    def get_post_content(self):
        try:
            # class_="section-content" no longer works, so we use the <article> tag;
            # everything is inside it, including the title & author names
            for content in self.page_soup.find_all('article'):
                return content.text
        except Exception as e:
            self._log_error("get_post_content", e)
        return ""

    def get_response(self, key, json_data):
        try:
            # res = json_data[key]['responsesCount']
            res = json_data[key]['postResponses']
            return res["count"]
        except Exception as e:
            self._log_error("get_response", e)
        return ""
Download Page Source
Let’s collect the page contents using the requests module and then parse them with BeautifulSoup’s HTML parser.
Since Medium is a dynamic site, you cannot simply scrape its contents via class elements. That’s why I found another way to do it: extracting the JSON stored within the page source. So, here, I store the JSON contents separately.
headers = requests.utils.default_headers()
headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
})
link = "https://medium.com/startup-grind/i-was-wrong-about-ethereum-804c9a906d36"
request_link = requests.get(link, headers=headers)
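# optional sanity check (my addition, not part of the original walkthrough):
# raise_for_status() raises requests.HTTPError on any 4xx/5xx response,
# so we fail fast instead of parsing an error page
request_link.raise_for_status()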
request_content = BeautifulSoup(request_link.content,'html.parser')
post_details = PostDetails(request_content, link)
json_basic_script = json.loads(post_details.json_response_basic())
json_full_script = json.loads(post_details.json_response_whole())
first_key_element = post_details.find_first_key(json_full_script)
Storing Each Content
Now, let’s call each method from our class and store the corresponding contents in separate variables.
post_title = post_details.get_title()
author_name, author_link = post_details.get_author_name(json_basic_script)
creation_date, published_date, modified_date = post_details.get_date(json_basic_script)
post_tags = post_details.get_tags(first_key_element, json_full_script)
post_readtime = post_details.get_read(first_key_element, json_full_script)
post_claps, post_voters = post_details.get_upvote(first_key_element, json_full_script)
post_contents = post_details.get_post_content()
post_responses = post_details.get_response(first_key_element, json_full_script)
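Since the stated goal is building a dataset, it may help to pack these values into a single record; here is a minimal sketch (the dictionary keys and the medium_posts.jsonl filename are my own choices):
post_record = {
    "title": post_title,
    "author": author_name,
    "author_link": author_link,
    "published": published_date[:10],
    "tags": post_tags,
    "read_time": post_readtime,
    "claps": post_claps,
    "voters": post_voters,
    "responses": post_responses,
}

# append the record to a JSON Lines file, one post per line
with open("medium_posts.jsonl", "a") as f:
    f.write(json.dumps(post_record) + "\n")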
Checking the Output
If we print all the variables now, we will get the corresponding outputs as follows:
print(post_title)
print(author_name)
print(author_link)
print(published_date[:10])
print(post_tags)
print((str(post_readtime))[:4])
print(post_claps)
print(post_voters)
print(post_responses)
I was wrong about Ethereum
WhalePanda
https://medium.com/@WhalePanda
2017-06-12
['blockchain', 'bitcoin', 'ethereum', 'fintech', 'finance']
8.61
3980
1718
131
That’s all for today. Chill!!! 😎