masterscraper/masterscraper/wikipedia/__init__.py

80 lines
2.3 KiB
Python

#!/usr/bin/python3
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from datetime import date
def getpage(url):
    """Fetch *url* and return its body parsed as a ``BeautifulSoup`` tree.

    Args:
        url: Address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built from the response content with the
        ``html.parser`` backend.
    """
    #--------[ Get Page From URL ]--------#
    # NOTE(review): the Access-Control-* entries are CORS *response* headers
    # and presumably have no effect when sent on a request; they are kept so
    # the request carries the same header set the original author intended.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    }
    # The second positional parameter of requests.get() is ``params`` (query
    # string), so the dict must be passed by keyword to be sent as headers.
    page = requests.get(url, headers=headers)
    return BeautifulSoup(page.content, 'html.parser')
# Lower-cased substrings that mark a column as unwanted. Header text is
# lower-cased before matching, so the Greek delta must be the lower-case 'δ'
# (an upper-case 'Δ' can never occur in a lower-cased string).
_UNWANTED_COLUMN_KEYWORDS = ('rank', 'change', 'notes', 'gap', 'δ')


def _table_rows(table, cell_tag):
    """Return stripped cell texts for each <tr> having 2+ *cell_tag* cells."""
    rows = []
    for tr in table.find_all('tr'):
        cells = [cell.text.strip() for cell in tr.find_all(cell_tag)]
        if len(cells) > 1:
            rows.append(cells)
    return rows


def _drop_ragged_rows(rows):
    """Remove, in place, every row whose length differs from the first row."""
    width = len(rows[0])
    rows[:] = [row for row in rows if len(row) == width]


def _drop_unwanted_columns(rows):
    """Remove, in place, columns whose first-row text has an unwanted keyword."""
    keep = [
        index
        for index, heading in enumerate(rows[0])
        if not any(word in heading.lower() for word in _UNWANTED_COLUMN_KEYWORDS)
    ]
    for row in rows:
        # Slice-assign so each row list keeps its identity for other holders.
        row[:] = [row[index] for index in keep]


def scrape(self, url):
    """Scrape a page's title, lead paragraph and first sortable wikitable.

    Fills ``self.meta['name']`` and ``self.meta['description']``, appends
    *url* to ``self.meta['sources']``, and appends the table's heading rows
    followed by its data rows to ``self.data``. Rows whose length differs
    from the first row of ``self.data`` are then removed, as are columns
    whose heading (first row) contains 'rank', 'change', 'notes', 'gap' or
    'Δ' (case-insensitive).

    Args:
        self: Object providing a ``meta`` mapping (with a list under
            ``'sources'``) and a ``data`` list; both are mutated in place.
        url: Address of the page to scrape.

    Raises:
        AttributeError: If the page has no ``mw-page-title-main`` span or no
            ``wikitable sortable`` table (the lookup yields ``None``).
        IndexError: If the page contains no ``<p>`` element.
    """
    #--------[ Get Page From URL ]--------#
    soup = getpage(url)

    #--------[ Get Metadata ]--------#
    self.meta['name'] = soup.find('span', class_='mw-page-title-main').text
    first_paragraph = soup.select('p')[0].getText().strip()
    # Strip bracketed markers such as "[1]" and flatten to a single line.
    self.meta['description'] = re.sub(r'\[.*?\]', '', first_paragraph).replace('\n', ' ')
    self.meta['sources'].append(url)

    #--------[ Extract Table ]--------#
    table = soup.find('table', class_='wikitable sortable')
    # Heading rows first, then data rows, so headings end up at the top.
    self.data.extend(_table_rows(table, 'th'))
    self.data.extend(_table_rows(table, 'td'))

    #--------[ Process Table ]--------#
    if not self.data:
        # Nothing was extracted; there is no heading row to compare against.
        return
    _drop_ragged_rows(self.data)
    _drop_unwanted_columns(self.data)