#!/usr/bin/python3
import re
from datetime import date

import pandas as pd
import requests
from bs4 import BeautifulSoup


def scrape(url, meta, data):
    #--------[ Get Page From URL ]--------#
    # Only User-Agent affects the request; the Access-Control-* entries are
    # CORS *response* headers and are ignored by the server.
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) '
                      'Gecko/20100101 Firefox/52.0'
    }
    # headers must be passed by keyword: the second positional argument of
    # requests.get() is `params`, not `headers`.
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    #--------[ Get Metadata ]--------#
    meta['name'] = soup.find('span', class_='mw-page-title-main').text
    # Strip footnote markers such as "[1]" and collapse newlines.
    meta['description'] = re.sub(r'\[.*?\]', '',
                                 soup.select('p')[0].get_text().strip()).replace('\n', ' ')
    meta['sources'].append(url)

    #--------[ Extract Table ]--------#
    table = soup.find('table', class_='wikitable sortable')

    # Get table headings
    for tr in table.find_all('tr'):
        row = [th.text.strip() for th in tr.find_all('th')]
        if len(row) > 1:
            data.append(row)

    # Get table data
    for tr in table.find_all('tr'):
        row = [td.text.strip() for td in tr.find_all('td')]
        if len(row) > 1:
            data.append(row)

    #--------[ Process Table ]--------#
    # Delete rows whose column count differs from the heading row
    key = 0
    while key < len(data):
        if len(data[key]) != len(data[0]):
            data.pop(key)
        else:
            key += 1

    # Delete unwanted columns by keyword match against the heading row.
    # Headings are lower-cased first, so the keywords must be lower case too
    # ('δ' rather than 'Δ', which could never match a lower-cased string).
    unwanted = ('rank', 'change', 'notes', 'gap', 'δ')
    key = 0
    while key < len(data[0]):
        if any(word in data[0][key].lower() for word in unwanted):
            for row in data:
                row.pop(key)
        else:
            key += 1
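
#--------[ Usage Example ]--------#
# A minimal usage sketch, not part of the original script: the Wikipedia URL
# below is only an illustrative assumption; any article containing a
# 'wikitable sortable' table should work. scrape() mutates the meta dict
# (which must already hold a 'sources' list) and the data list in place,
# leaving the heading row at data[0] and the data rows after it.
if __name__ == '__main__':
    meta = {'sources': []}  # scrape() appends the URL to this list
    data = []
    scrape('https://en.wikipedia.org/wiki/List_of_largest_cities', meta, data)

    # First extracted row is the heading row; the rest are data rows.
    df = pd.DataFrame(data[1:], columns=data[0])
    print(meta['name'], '-', meta['description'])
    print(df.head())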