masterscraper/masterscraper/__init__.py

563 lines
20 KiB
Python

#!/usr/bin/python3
import pandas as pd
import requests
import re
import os
from bs4 import BeautifulSoup
from datetime import date
from . import wikipedia
from . import macrotrends
# Check If String Is Number
def isfloat(num):
    """Return True when ``num`` can be converted with ``float()``.

    Args:
        num: Any object; typically a string taken from a scraped table cell.

    Returns:
        bool: True if ``float(num)`` succeeds, otherwise False.
    """
    try:
        float(num)
    # Only the errors float() raises for unconvertible input are treated as
    # "not a number"; KeyboardInterrupt / SystemExit are no longer swallowed.
    except (TypeError, ValueError, OverflowError):
        return False
    return True
# Load URL Scrape List
def scrapelist(filename):
    """Read ``filename`` and return its lines with surrounding whitespace removed.

    Blank lines are kept and appear as empty strings in the result.

    Args:
        filename: Path of the text file to read, one entry per line.

    Returns:
        list[str]: The stripped lines, in file order.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'r') as f:
        return [line.strip() for line in f]
class scrape:
    """Scrape one URL into a table and export every data column as JSON.

    ``data`` is a list of rows whose first row holds the column headers.
    ``get_meta()`` derives ``data_info`` from those headers and must run
    before ``clean()`` and ``save()``, because both read ``data_info``.

    ``data_info`` layout after ``get_meta()``:
        [0] raw column headers
        [1] normalised key names (``country.name``, ``year``, ``<name>.<key>``)
        [2] unit per column
        [3] numeric multiplier per column (0.01 for percentages, else 1.0)
        [4] year label per column
        [5] scope per column
    """

    #--------[ Global Variables ]--------#
    # Class-level defaults only; __init__ rebinds all three on the instance,
    # so these mutable objects are not shared between normal instances.
    meta = {}       # Metadata
    data = []       # Actual Data
    data_info = []  # Variable information to split data table to separate JSON files

    #--------[ Scrape Constructor Object ]--------#
    def __init__(self, url):
        """Create the empty metadata record and run the matching scraper(s).

        Args:
            url: Page to scrape. Dispatch is by substring: ``wikipedia.org``
                and/or ``macrotrends.net``; any other URL leaves the object
                empty.
        """
        print('\n[{0}]'.format(url))
        self.meta = {
            "name" : None,
            "description" : None,
            "units" : None,
            "year" : None,
            "notes" : [],
            "id" : None,
            "type" : None,
            "scope" : None,
            "category" : None,
            "subcategory" : None,
            "tags" : [],
            "authors" : [],
            "sources" : []
        }
        self.data = []
        self.data_info = []
        # The scrapers receive meta/data and their return value is ignored;
        # presumably they fill both in place -- confirm in the scraper modules.
        if 'wikipedia.org' in url:
            wikipedia.scrape(url, self.meta, self.data)
        if 'macrotrends.net' in url:
            macrotrends.scrape(url, self.meta, self.data)

    #--------[ Scrape Deconstructor ]--------#
    def __del__(self):
        """No clean-up is required; kept so explicit ``__del__`` calls still work."""
        pass

    #--------[ Show Scrape Data ]--------#
    def show(self):
        """Print the metadata dict followed by every row of ``data``."""
        print(self.meta)
        for row in self.data:
            print(row)

    #--------[ Get Metadata ]--------#
    def get_meta(self):
        """Derive names, units, scopes, type, category and tags from the table.

        Rebuilds ``self.data_info`` (see the class docstring for its layout)
        and updates ``name``, ``type``, ``category``, ``subcategory`` and
        ``tags`` in ``self.meta``.

        Returns:
            1 if the table has no data rows, otherwise None.
        """
        # Break if scrape contains no data
        if len(self.data) <= 1:
            return 1

        # Process Name
        name = self.meta['name'].lower()
        name = name.replace('and dependencies ', '')
        name = name.replace('list of ', '')
        name = name.strip().title()
        for titled, acronym in (('Gdp', 'GDP'), ('Gni', 'GNI'), ('Gnp', 'GNP')):
            name = name.replace(titled, acronym)
        self.meta['name'] = name
        prefix = name.lower().replace(' ', '.')

        # Get Key Names
        headers = list(self.data[0])
        # Process Key Names
        names = [self._key_name(header, prefix) for header in headers]
        # Process Unit Type
        units = [self._key_unit(header) for header in headers]
        # Process Variable Multiplier
        multipliers = [0.01 if '%' in header.lower() else 1.0 for header in headers]
        # Get Year
        years = [self._key_year(header, names) for header in headers]

        # Get Type (when several marker columns exist the last one wins)
        type_by_key = {
            'country.name': 'global',
            'year': 'historical',
            'us.county.fips': 'regional',
            'uk.constituency.name': 'regional',
        }
        for key in names:
            if key in type_by_key:
                self.meta['type'] = type_by_key[key]
        if self.meta['type'] is None:
            # NOTE(review): 'unkown' is misspelt but is an emitted value and
            # path component; left unchanged so existing output stays stable.
            self.meta['type'] = 'unkown'

        # Get Scope
        scopes = [self._key_scope(key, self.meta['type']) for key in names]

        # Rebuilt as a whole so that calling get_meta() twice does not keep
        # appending stale entries.
        self.data_info = [headers, names, units, multipliers, years, scopes]

        # Get Category
        # The title and all headers are searched together. The title is now
        # always included (it used to be the join separator, so it vanished
        # for single-column tables and glued neighbouring words together).
        search = ' '.join([name] + headers).lower().strip()
        category, subcategory = self._categorise(search)
        self.meta['category'] = category
        if subcategory is not None:
            # Branches without a subcategory leave any existing value alone.
            self.meta['subcategory'] = subcategory

        # Get Tags
        tags = self.meta['tags']
        candidates = [self.meta['type'], self.meta['category'],
                      self.meta['subcategory']] + scopes
        for tag in candidates:
            # A missing subcategory (None) is not a tag.
            if tag is not None and tag not in tags:
                tags.append(tag)

    @staticmethod
    def _key_name(header, prefix):
        """Return the normalised key name for one raw column header."""
        key = header.lower()
        if 'country' in key or 'countries' in key or 'dependency' in key:
            return 'country.name'
        if 'year' in key:
            return 'year'
        key = re.sub(r'\[.*\]', '', key)
        key = re.sub(r'\(.*\)', '', key)
        key = key.replace('km2', '')
        key = key.replace(' in ', '')
        # Literal dollar sign; as a regex '$' only matched end-of-string.
        key = key.replace('$', '')
        key = key.replace('%', 'percent')
        key = key.replace('and dependencies ', '')
        key = key.replace('list of countries by ', '')
        key = key.strip().replace(' ', '.')
        if key != prefix:
            key = prefix + '.' + key
        return key

    @staticmethod
    def _key_unit(header):
        """Return the unit guessed from one raw column header.

        Branch order matters: the first matching rule wins.
        """
        key = header.lower()

        def has(*words):
            return any(word in key for word in words)

        is_rate = 'rate' in key
        if has('percent', 'perc', '%'):
            return '%'
        if has('dollar', '$'):
            return '$'
        # The euro sign literal had been lost (empty string), which matched
        # every header and made all following rules unreachable.
        if has('euro', '€'):
            return '€'
        if has('area', 'land', 'km2', 'km²', 'mi2', 'mi²'):
            return 'km²'
        if has('country', 'countries', 'dependencies'):
            return 'countries'
        if has('index', 'score', 'report'):
            return 'index'
        if 'population' in key:
            return 'people/km²' if 'density' in key else 'people'
        # NOTE(review): 'death' alone is enough here; the rate/infant/maternal
        # conditions only qualify 'mortality' (operator precedence preserved).
        if 'death' in key or ('mortality' in key and is_rate
                              and 'infant' not in key
                              and 'maternal' not in key):
            return 'deaths/1k population'
        # Infant mortality is reported per 1,000 live births and maternal
        # mortality per 100,000 live births (the two had been swapped).
        if 'mortality' in key and is_rate and 'infant' in key:
            return 'deaths/1k live births'
        if 'mortality' in key and is_rate and 'maternal' in key:
            return 'deaths/100k live births'
        if 'birth' in key and is_rate:
            return 'births/1k population'
        if 'fertility' in key and is_rate:
            return 'children/women'
        if 'marriage' in key and is_rate:
            return 'marriages/1k population'
        if 'divorce' in key and is_rate:
            return 'divorces/1k population'
        if 'crime' in key and is_rate:
            return 'crimes/100k population'
        if 'murder' in key and is_rate:
            return 'murders/100k population'
        if 'military' in key and 'size' in key:
            # NOTE(review): misspelt, but emitted into output; left unchanged.
            return 'personel'
        # NOTE(review): 'rate' only qualifies 'refugee' (precedence preserved).
        if has('immigration', 'migration') or ('refugee' in key and not is_rate):
            return 'people'
        if 'emissions' in key:
            return 'tonnes'
        return 'unkown'

    def _key_year(self, header, names):
        """Return the year label for one column.

        A header starting with four digits is its own label; otherwise the
        range spanned by the first and last data rows of the ``year`` column
        is used, falling back to the current year.
        """
        if re.match(r'\d\d\d\d', header):
            return header
        if 'year' in names:
            column = names.index('year')
            first = self.data[1][column]
            last = self.data[-1][column]
            # Always yields exactly one label so the list stays aligned with
            # the columns.
            low, high = (first, last) if first <= last else (last, first)
            return '{0}-{1}'.format(low, high)
        return date.today().strftime('%Y')

    @staticmethod
    def _key_scope(key, default):
        """Return the scope named inside a normalised key, else ``default``."""
        # 'female' must be tested before 'male', which it contains.
        for scope in ('female', 'male', 'black', 'white',
                      'asian', 'native', 'urban', 'rural'):
            if scope in key:
                return scope
        return default

    @staticmethod
    def _categorise(search):
        """Map the lower-cased title+headers text to (category, subcategory).

        ``subcategory`` is None for categories that do not define one.
        Rule order matters: the first matching rule wins.
        """
        def has(*words):
            return any(word in search for word in words)

        #--------[ Geographic ]--------#
        if has('area', 'km2'):
            return 'geographic', 'area'
        #--------[ Demographic ]--------#
        # NOTE(review): 'demogrpahic' is misspelt but is an output path
        # component; left unchanged so existing data directories still match.
        if has('population'):
            return 'demogrpahic', None
        if has('birth', 'fertility'):
            return 'demogrpahic', 'fertility'
        #--------[ Health ]--------#
        if has('life expectancy', 'death', 'suicide', 'mortality'):
            return 'health', 'mortality'
        if has('depression', 'anxiety'):
            return 'health', 'psychology'
        if has('smoking', 'alcohol'):
            return 'health', 'drugs'
        #--------[ Economic ]--------#
        if has('gdp') and not has('trade', 'health'):
            return 'economic', 'gdp'
        if has('gni'):
            return 'economic', 'gni'
        if has('debt'):
            return 'economic', 'debt'
        if has('inflation'):
            return 'economic', 'inflation'
        if has('health') and has('spend'):
            return 'economic', 'welfare'
        # Correct spellings added next to the original misspelt search terms.
        if has('manufature', 'manufactur', 'business'):
            return 'economic', 'business'
        if has('import', 'export', 'invest', 'tarrif', 'tariff', 'trade'):
            return 'economic', 'trade'
        if has('unemployment', 'labor'):
            return 'economic', 'labor-force'
        #--------[ Education ]--------#
        if has('education', 'literacy'):
            return 'education', None
        #--------[ Development ]--------#
        if has('development', 'competitive'):
            return 'development', None
        #--------[ Crime ]--------#
        if has('crime', 'homocide', 'homicide', 'murder'):
            # Was filed under 'development' by a copy-paste slip.
            return 'crime', None
        #--------[ Military ]--------#
        if has('military'):
            return 'military', None
        #--------[ Uncategorised ]--------#
        return 'uncategorised', None

    #--------[ Clean Scrape Data ]--------#
    def clean(self):
        """Normalise every data cell in place (the header row is untouched).

        Strings lose inline notes (``[...]`` / ``(...)``) and commas; strings
        containing digits are reduced to a number, scaled by the column
        multiplier and stored as int when whole; placeholder texts, ``-`` and
        empty strings become None.

        Returns:
            1 if the table has no data rows, otherwise None.
        """
        # Break if scrape contains no data
        if len(self.data) <= 1:
            return 1

        # Multipliers come from get_meta(); without them values are unscaled.
        multipliers = self.data_info[3] if len(self.data_info) > 3 else []
        for row in self.data[1:]:
            for column in range(len(row)):
                factor = multipliers[column] if column < len(multipliers) else 1.0
                row[column] = self._clean_cell(row[column], factor)

    @staticmethod
    def _clean_cell(value, factor):
        """Return the cleaned form of a single cell value."""
        if isinstance(value, str):
            # Remove any inline notes from data
            value = re.sub(r'\[.*\]', '', value)
            value = re.sub(r'\(.*\)', '', value)
            value = value.replace(',', '').strip()
            # Convert numerical strings to floats
            if any(char.isdigit() for char in value):
                # Keep a leading ASCII or Unicode minus so negative values do
                # not silently turn positive.
                negative = value.startswith(('-', '\u2212'))
                value = ''.join(char for char in value
                                if char.isdigit() or char == '.')
                if negative:
                    value = '-' + value

        # Convert To Float
        if isfloat(value):
            # Apply Variable Multiplier
            value = float(value) * factor
            # Convert Whole Floats To Integers
            if value.is_integer():
                value = int(value)

        # Convert non-entries to null
        if isinstance(value, str):
            lowered = value.lower()
            placeholders = ('not determined', 'negligible', 'unkown', 'unknown')
            if any(text in lowered for text in placeholders):
                value = None
        if value == '-' or value == '':
            value = None
        return value

    #--------[ Save Scrape Data ]--------#
    def save(self):
        """Write one JSON file per data column under ``data/<type>/...``.

        Each file pairs the main column (the last column normalised to
        ``country.name`` or ``year``, else column 0) with one other column and
        embeds the metadata. ``units``, ``year`` and, unless it was already
        set before the call, ``scope`` in ``self.meta`` are updated per column.

        Returns:
            1 if the table has no data rows, otherwise None.
        """
        # Break if scrape contains no data
        if len(self.data) <= 1:
            return 1

        names = self.data_info[1]
        quote = self._json_value

        # Locate the main column via the normalised names. The former test
        # (`header == 'country.name' >= 0`) was a chained comparison that was
        # either False or raised TypeError.
        key_main = 0
        for index, name in enumerate(names):
            if name in ('country.name', 'year'):
                key_main = index

        # A scope supplied before save() applies to every file; otherwise each
        # column gets its own scope instead of inheriting the first column's.
        scope_preset = self.meta['scope'] is not None

        for key_data in range(len(self.data[0])):
            if key_data == key_main:
                continue

            #--------[ Update Metadata ]--------#
            # Done before the path is built because the path needs the scope.
            self.meta['units'] = self.data_info[2][key_data]
            self.meta['year'] = self.data_info[4][key_data]
            if not scope_preset:
                self.meta['scope'] = self.data_info[5][key_data]

            #--------[ Generate Filename ]--------#
            filename = names[key_data].replace('.', '-')
            filepath = 'data/{0}'.format(self.meta['type'])
            if self.meta['type'] == 'historical':
                filepath += '/' + self.meta['scope'].lower().replace(' ', '-')
            filepath += '/{0}'.format(self.meta['category'])
            if self.meta['subcategory'] is not None:
                filepath += '/' + self.meta['subcategory']
            if len(self.data[0]) > 4:
                filepath += '/' + self.meta['name'].lower().replace(' ', '-')
            fullpath = filepath + '/' + filename + '.json'

            #--------[ Check File Directory ]--------#
            os.makedirs(filepath, exist_ok=True)

            #--------[ Build Document ]--------#
            column = names[key_data]
            if self.meta['type'] == 'historical' and self.meta['id'] is not None:
                column = '{0}.{1}'.format(self.meta['id'], column)

            meta_entries = [self._meta_entry(key, value)
                            for key, value in self.meta.items()]
            rows = [' [{0},{1}]'.format(quote(row[key_main]), quote(row[key_data]))
                    for row in self.data[1:]]
            # Separators are placed by join(), i.e. by position; comparing
            # values with the last element dropped commas on duplicates.
            document = ''.join([
                '{\n',
                ' "metadata" : {\n',
                ',\n'.join(meta_entries), '\n',
                ' },\n',
                ' "data" : [\n',
                ' [{0},{1}],\n'.format(quote(names[key_main]), quote(column)),
                ',\n'.join(rows), '\n',
                ' ]\n',
                '}\n',
            ])

            #--------[ Write File ]--------#
            with open(fullpath, 'w', encoding='utf-8') as f:
                f.write(document)

            #--------[ Final Result ]--------#
            print(' [{0} data points] -> {1}'.format(len(self.data) - 1, fullpath))

    @classmethod
    def _meta_entry(cls, key, value):
        """Render one ``"key" : value`` line of the metadata object."""
        label = ' {0} : '.format(cls._json_value(str(key)))
        if isinstance(value, list):
            # List items are written as strings.
            items = [cls._json_value(str(item)) for item in value]
            if not items:
                return label + '[]'
            if key == 'tags':
                return label + '[' + ','.join(items) + ']'
            return label + '[\n' + ',\n'.join(' ' + item for item in items) + '\n ]'
        return label + cls._json_value(value)

    @staticmethod
    def _json_value(value):
        """Render a scalar as a JSON token (escaped string, number, bool, null)."""
        import json  # local: only needed for string escaping
        if value is None:
            return 'null'
        if isinstance(value, bool):
            return 'true' if value else 'false'
        if isinstance(value, str):
            return json.dumps(value, ensure_ascii=False)
        if isinstance(value, float) and (value != value
                                         or value in (float('inf'), float('-inf'))):
            # NaN / infinity are not valid JSON.
            return 'null'
        return '{0}'.format(value)