119 lines
3.5 KiB
Python
119 lines
3.5 KiB
Python
#!/usr/bin/python3
|
|
|
|
import requests
|
|
import pandas as pd
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
from datetime import date
|
|
|
|
|
|
|
|
def getpage(url, timeout=30):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend. The HTTP status code is not checked, so an
        error page is parsed like any other page.
    """
    #--------[ Get Page From URL ]--------#
    # Only the User-Agent is sent. The Access-Control-* entries that used to
    # live in this dict are CORS *response* headers and mean nothing on a
    # request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    }

    # headers must be passed by keyword: the second positional argument of
    # requests.get() is ``params``, which would turn the dict into a query
    # string and leave the default User-Agent in place.
    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup
|
|
|
|
|
|
|
|
def scrapelist():
    """Collect the URL of every per-country dataset listed on MacroTrends.

    The topic overview page is fetched first; every link found inside its
    ``col-xs-3`` blocks is treated as a global metric page. Each of those
    pages is then fetched and every link inside its first ``col-xs-12``
    block is recorded.

    Returns:
        A list of absolute dataset URLs, in the order they were found.
        Each collected href is printed as progress output, followed by a
        summary line with the total count.
    """
    # Imported locally so the block does not depend on the module import list.
    from urllib.parse import urljoin

    overview_url = 'https://www.macrotrends.net/countries/topic-overview'
    soup = getpage(overview_url)

    # Get URL list of global metrics
    links = []
    for table in soup.find_all('div', class_='col-xs-3'):
        # href=True skips anchors that carry no link target at all.
        for link in table.find_all('a', href=True):
            # urljoin resolves root-relative hrefs against the site and
            # leaves already-absolute hrefs intact.
            links.append(urljoin(overview_url, link['href']))

    # Get full country list for each global metric
    full_list = []
    for link in links:
        soup = getpage(link)
        table = soup.find('div', class_='col-xs-12')
        if table is None:
            # One page with an unexpected layout should not abort the
            # whole crawl; report it and move on.
            print('No dataset list found at {0}'.format(link))
            continue
        for anchor in table.find_all('a', href=True):
            full_list.append(urljoin(link, anchor['href']))
            print(anchor['href'])

    print('\nScraping {0} datasets from MacroTrends\n'.format(len(full_list)))

    return full_list
|
|
|
|
|
|
|
|
|
|
def scrape(self, url):
    """Scrape one MacroTrends dataset page into ``self.meta`` and ``self.data``.

    Args:
        url: Address of the dataset page. It is split on ``/`` and its last
            three segments are used as id, scope and name respectively, so
            it must not end with a trailing slash.

    Side effects:
        * ``self.meta['name']``, ``['description']``, ``['scope']`` and
          ``['id']`` are overwritten.
        * ``url`` is appended to ``self.meta['sources']`` and the page's
          "Data Source" text to ``self.meta['authors']`` (both are assumed
          to be lists that already exist -- confirm against the caller).
        * Heading and data rows of the ``historical_data_table`` are appended
          to ``self.data``, which is then cleaned in place: rows whose length
          differs from the first row are removed, and columns whose heading
          mentions rank, change, notes, gap, delta or growth are dropped.

    Raises:
        AttributeError: If an expected element (description tab, data source
            label, table container or table) is missing from the page,
            because the lookup then yields ``None``.
    """
    #--------[ Get Page From URL ]--------#
    soup = getpage(url)

    #--------[ Get Metadata ]--------#
    url_parts = url.split('/')
    slug = url_parts[-1]

    self.meta['name'] = slug.replace('-', ' ').title()

    # The description lives on the separate "ranking" page of the same slug.
    soup_desc = getpage('https://www.macrotrends.net/countries/ranking/' + slug)
    self.meta['description'] = soup_desc.find('div', class_='navigation_tabs').find('span').text

    self.meta['authors'].append(soup.find('span', string='Data Source: ').next_sibling.text)

    self.meta['sources'].append(url)

    self.meta['scope'] = url_parts[-2].replace('-', ' ').title()

    self.meta['id'] = url_parts[-3].lower()

    #--------[ Extract Table ]--------#
    table = soup.find('div', class_='col-xs-6')
    table = table.find('table', class_='historical_data_table')
    table_rows = table.find_all('tr')

    # Heading rows (<th>) are collected first, then data rows (<td>), so the
    # headings end up ahead of the data in self.data. Rows with fewer than
    # two cells are ignored.
    for cell_tag in ('th', 'td'):
        for tr in table_rows:
            row = [cell.text.strip() for cell in tr.find_all(cell_tag)]
            if len(row) > 1:
                self.data.append(row)

    #--------[ Process Table ]--------#
    # With no rows at all there is no first row to measure against, and
    # indexing self.data[0] would raise IndexError, so cleaning is skipped.
    if self.data:
        # Delete rows with incorrect number of variables
        width = len(self.data[0])
        # Slice assignment keeps the same list object, as pop() did.
        self.data[:] = [row for row in self.data if len(row) == width]

        # Delete unwanted table columns
        # Both sides are lowercased before comparing. Previously only the
        # heading was lowercased, so the uppercase delta keyword could
        # never be found in it.
        unwanted = ('rank', 'change', 'notes', 'gap', 'Δ', 'growth')
        keep = [
            index
            for index, heading in enumerate(self.data[0])
            if not any(word.lower() in heading.lower() for word in unwanted)
        ]
        if len(keep) != width:
            for row in self.data:
                # Rebuild each row in place so existing references to the
                # row lists stay valid.
                row[:] = [row[index] for index in keep]