diff --git a/masterscraper/__init__.py b/masterscraper/__init__.py index 9a2f453..79453c7 100644 --- a/masterscraper/__init__.py +++ b/masterscraper/__init__.py @@ -1,44 +1,26 @@ #!/usr/bin/python3 - -import pandas as pd -import requests -import re -import os - -from bs4 import BeautifulSoup -from datetime import date - -from . import wikipedia -from . import macrotrends - - - -# Check If String Is Number -def isfloat(num): - try: - float(num) - return True - except: - return False - - - -# Load URL Scrape List -def scrapelist(filename): - f = open(filename, 'r') - tmp_list = [l.strip() for l in f.readlines()] - f.close() - return(tmp_list) - - - class scrape: - #--------[ Global Variables ]--------# - meta = {} # Metadata - data = [] # Actual Data - data_info = [] # Variable information to split data table to seperate JSON files + #--------[ Import Module Parts ]--------# + from . import wikipedia + from . import macrotrends + + from .core.meta_name import meta_name + from .core.meta_search import meta_search + from .core.meta_keys import meta_keys + from .core.meta_year import meta_year + from .core.meta_units import meta_units + from .core.meta_multiplyer import meta_multiplyer + from .core.meta_scope import meta_scope + from .core.meta_category import meta_category + from .core.meta_type import meta_type + from .core.meta_tags import meta_tags + + from .core.get_list import get_list + from .core.show import show + from .core.clean import clean + from .core.save import save #--------[ Scrape Constructor Object ]--------# @@ -47,28 +29,25 @@ class scrape: print('\n[{0}]'.format(url)) self.meta = { - "name" : None, - "description" : None, - "units" : None, - "year" : None, - "notes" : [], - "id" : None, - "type" : None, - "scope" : None, - "category" : None, - "subcategory" : None, - "tags" : [], - "authors" : [], - "sources" : [] + "name" : None, # Variable/Set name + "description" : None, # Description of variable/set + "units" : None, # Units of variable + "year" : None, # Year(s) of variable + "notes" : [], # Any notes related to the variable/set + "id" : None, # Official ID of applicable + "type" : None, # Type of variable/set + "scope" : None, # Scope of the variable/set + "category" : None, # Main category of the variable/set + "subcategory" : None, # Subcategory of the variable/set + "tags" : [], # Search tags applicable to the variable/set + "authors" : [], # Person or organisation responsible for the data + "sources" : [] # URL Sources for the data } - self.data = [] - self.data_info = [] - - if url.find('wikipedia.org') >=0: - wikipedia.scrape(url, self.meta, self.data) - if url.find('macrotrends.net') >= 0: - macrotrends.scrape(url, self.meta, self.data) + self.data = [] # The actual data set + self.info = {} # Temoporary metadata extracted from the data set + if url.find('wikipedia.org') >=0: self.wikipedia.scrape(self, url ) + if url.find('macrotrends.net') >=0: self.macrotrends.scrape(self, url ) #--------[ Scrape Deconstructor ]--------# @@ -76,623 +55,17 @@ class scrape: pass - - #--------[ Show Scrape Data ]--------# - def show(self): - print(self.meta) - for row in self.data: - print(row) - - #--------[ Get Metadata ]--------# def get_meta(self): - - # Break if scrape contains no data - if len(self.data) <= 1: return(1) - - # Process Name - self.meta['name'] = self.meta['name'].lower() - self.meta['name'] = re.sub('and\ dependencies ','',self.meta['name']) - self.meta['name'] = re.sub('list\ of\ ','',self.meta['name']) - self.meta['name'] = 
re.sub(',','',self.meta['name']) - self.meta['name'] = self.meta['name'].strip() - self.meta['name'] = self.meta['name'].title() - - self.meta['name'] = self.meta['name'].replace('Gdp', 'GDP') - self.meta['name'] = self.meta['name'].replace('Gni', 'GNI') - self.meta['name'] = self.meta['name'].replace('Gnp', 'GNP') - - - # Get Key Names Search Spaces - #self.data_info.append( [key for key in self.data[0]]) - key_search = [] - for i in range(0, len(self.data[0])): - key_search.append( - self.meta['name'].lower() + ' ' + - self.data[0][i].lower() + ' ' + - self.data[1][i].lower() - ) - - self.data_info.append( key_search ) - - - # Process Variable Key Names - key_name = [] - for key in self.data[0]: - if(key.lower().find('country') >=0 or - key.lower().find('countries') >=0 or - key.lower().find('dependency') >=0 ): - key_name.append('country.name') - elif(key.lower().find('year') >=0): - key_name.append('year') - elif(key.lower().find('date') >=0): - key_name.append('date') - else: - - tmp_key = key - tmp_key = tmp_key.lower() - - tmp_key = re.sub(',', '', tmp_key) - tmp_key = re.sub('\[.*\]', '', tmp_key) - tmp_key = re.sub('\(.*\)', '', tmp_key) - tmp_key = re.sub('km2', '', tmp_key) - tmp_key = re.sub('km', '', tmp_key) - tmp_key = re.sub('mi2', '', tmp_key) - tmp_key = re.sub('hectares', '', tmp_key) - tmp_key = re.sub('\ in\ ', '', tmp_key) - tmp_key = re.sub('US\ \$', '', tmp_key) - tmp_key = re.sub('\$', 'dollars', tmp_key) - tmp_key = re.sub('\%', 'percent', tmp_key) - - tmp_key = re.sub('and\ dependencies ', '', tmp_key) - tmp_key = re.sub('list\ of\ countries\ by\ ', '', tmp_key) - - tmp_key = re.sub('thousands\ of', '' ,tmp_key) - tmp_key = re.sub('millions\ of', '' ,tmp_key) - tmp_key = re.sub('billions\ of', '' ,tmp_key) - - tmp_key = re.sub('per\ 100k\ live\ births', '', tmp_key) - tmp_key = re.sub('per\ 100k\ population', '', tmp_key) - - tmp_key = tmp_key.strip() - tmp_key = tmp_key.replace(' ','.') - - if tmp_key.find(self.meta['name'].lower().replace(' ','.')) <0: - if tmp_key != '': - tmp_key = self.meta['name'].lower().replace(' ','.') + '.' 
+ tmp_key - else: - tmp_key = self.meta['name'].lower().replace(' ','.') - - key_name.append( tmp_key ) - self.data_info.append( key_name ) - - - # Process Variable Unit Type - key_unit = [] - for key in self.data_info[0]: - - if( key.find('percent') >=0 or - key.find('perc') >=0 or - key.find('%') >=0 ): - key_unit.append('%') - - elif( key.find('dollar') >=0 or - key.find('$') >=0 ): - key_unit.append('$') - - elif( key.find('euro') >=0 or - key.find('€') >=0 ): - key_unit.append('€') - - elif( key.find('area') >=0 or - key.find('land') >=0 or - key.find('km2') >=0 or - key.find('km²') >=0 or - key.find('mi2') >=0 or - key.find('mi²') >=0 or - key.find('ha') >=0 or - key.find('hectares') >=0 ): - key_unit.append('km²') - - elif( key.find('country') >=0 or - key.find('countries') >=0 or - key.find('dependencies') >=0 ): - key_unit.append('countries') - - elif( key.find('index') >=0 or - key.find('score') >=0 or - key.find('report') >=0 ): - key_unit.append('index') - - elif( key.find('population') >=0 and - key.find('density') <0 and - key.find('access') <0 and - key.find('crime') <0 and - key.find('murder') <0 ): - key_unit.append('people') - - elif( key.find('population') >=0 and - key.find('density') >=0 ): - key_unit.append('people/km²') - - elif( (key.find('death') >=0 or - key.find('mortality') >=0) and - key.find('rate') >=0 and - key.find('infant') <0 and - key.find('maternal') <0 ): - key_unit.append('deaths/1k population') - - elif( key.find('mortality') >=0 and - key.find('rate') >=0 and - key.find('infant') >=0 ): - key_unit.append('deaths/1k live births') - - elif( key.find('mortality') >=0 and - key.find('rate') >=0 and - key.find('maternal') >=0 ): - key_unit.append('deaths/100k live births') - - elif( key.find('suicide') >=0 and - key.find('rate') >=0 ): - key_unit.append('deaths/100k population') - - elif( key.find('life') >=0 and - key.find('expectancy') >=0 ): - key_unit.append('years') - - elif( key.find('birth') >=0 and - key.find('rate') >=0 ): - key_unit.append('births/1k population') - - elif( key.find('fertility') >=0 and - key.find('rate') >=0 ): - key_unit.append('children/women') - - elif( key.find('marriage') >=0 and - key.find('rate') >=0 ): - key_unit.append('marriages/1k population') - - elif( key.find('divorce') >=0 and - key.find('rate') >=0 ): - key_unit.append('divorces/1k population') - - elif( key.find('crime') >=0 and - key.find('rate') >=0 ): - key_unit.append('crimes/100k population') - - elif( key.find('murder') >=0 and - key.find('rate') >=0 ): - key_unit.append('murders/100k population') - - elif( key.find('military') >=0 and - key.find('size') >=0 ): - key_unit.append('personel') - - elif( key.find('immigration') >=0 or - key.find('migration') >=0 or - key.find('refugee') >=0 ): - key_unit.append('people') - - elif( key.find('emissions') >=0 ): - key_unit.append('tonnes') - - else: - key_unit.append('unkown') - self.data_info.append( key_unit ) - - # Process Variable Multiplyer - key_multiplyer = [] - for key in self.data_info[0]: - - if( key.find('%') >=0 or key.find('percent') >=0 ): - key_multiplyer.append( 0.01 ) - - elif( re.search('\$.*k', key) ): key_multiplyer.append(1000) - elif( re.search('\$.*m', key) ): key_multiplyer.append(1000000) - elif( re.search('\$.*b', key) ): key_multiplyer.append(1000000000) - - elif( key.find('thousands of') >=0 ): - key_multiplyer.append(1000) - elif( key.find('millions of') >=0 ): - key_multiplyer.append(1000000) - elif( key.find('bilions of') >=0 ): - key_multiplyer.append(1000000000) - - elif( 
key.find('mi2') >=0 or key.find('mi²') >=0 ): - key_multiplyer.append(2.59) - elif( key.find('hectare') >=0 ): - key_multiplyer.append(0.01) - - else: - key_multiplyer.append( 1.0 ) - self.data_info.append( key_multiplyer ) - - - # Get Variable Year - key_year = [] - for key in self.data[0]: - if re.match('\d\d\d\d', key): - key_year.append( key ) - elif 'year' in self.data_info[1]: - y1 = self.data[1][self.data_info[1].index('year')] - y2 = self.data[-1][self.data_info[1].index('year')] - if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) ) - if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) ) - elif 'date' in self.data_info[1]: - y1 = self.data[1][self.data_info[1].index('date')].split('-')[0] - y2 = self.data[-1][self.data_info[1].index('date')].split('-')[0] - if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) ) - if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) ) - - else: - key_year.append( date.today().strftime('%Y') ) - self.data_info.append( key_year ) - - - # Get Variable Type - for key in self.data_info[1]: - if key == 'country.name': self.meta['type'] = 'global' - elif key == 'year': self.meta['type'] = 'historical' - elif key == 'date': self.meta['type'] = 'historical' - elif key == 'us.county.fips': self.meta['type'] = 'regional' - elif key == 'uk.constituency.name': self.meta['type'] = 'regional' - if self.meta['type'] == None: self.meta['type'] = 'unkown' - - - # Get Variable Scope - key_scope = [] - for key in self.data_info[1]: - if key.find('male') >=0: key_scope.append( 'male' ) - elif key.find('female') >=0: key_scope.append( 'female' ) - elif key.find('black') >=0: key_scope.append( 'black' ) - elif key.find('white') >=0: key_scope.append( 'white' ) - elif key.find('asian') >=0: key_scope.append( 'asian' ) - elif key.find('native') >=0: key_scope.append( 'native' ) - elif key.find('urban') >=0: key_scope.append( 'urban' ) - elif key.find('rural') >=0: key_scope.append( 'rural' ) - else: key_scope.append( self.meta['type'] ) - self.data_info.append( key_scope ) - - - # Get Variable Category - search = self.meta['name'].join(self.data_info[0]).lower().strip() - - - #--------[ Geographic ]--------# - if( search.find('area') >=0 or - search.find('km2') >=0 ): - self.meta['category'] = 'geographic' - self.meta['subcategory'] = 'area' - - elif( (search.find('arable') >=0 or - search.find('farm') >=0 or - search.find('forrested') >=0) and - search.find('land') >=0 ): - self.meta['category'] = 'geographic' - self.meta['subcategory'] = 'land' - - - #--------[ Demographic ]-------- - elif( search.find('population') >=0 and - search.find('access') <0 and - search.find('murder') <0 and - search.find('crime') <0 and - search.find('hunger') <0 and - search.find('migrat') <0 and - search.find('migrant') <0 ): - self.meta['category'] = 'demogrpahic' - self.meta['subcategory'] = 'population' - - elif( (search.find('birth') >=0 or - search.find('fertility') >=0) and - search.find('mortality') <0 ): - self.meta['category'] = 'demogrpahic' - self.meta['subcategory'] = 'fertility' - - elif( search.find('immigrat') >=0 or - search.find('migrat') >=0 or - search.find('migrant') >=0 or - search.find('refugee') >=0 or - search.find('asylum') >=0 ): - self.meta['category'] = 'demogrpahic' - self.meta['subcategory'] = 'migration' - - - #--------[ Health ]--------# - elif( search.find('life expectancy') >=0 or - search.find('death') >=0 or - search.find('suicide') >=0 or - search.find('mortality') >=0 ): - self.meta['category'] = 'health' - self.meta['subcategory'] = 'mortality' - - 
elif( search.find('depression') >=0 or - search.find('anxiety') >=0 ): - self.meta['category'] = 'health' - self.meta['subcategory'] = 'psychology' - - elif( search.find('smoking') >= 0 or - search.find('alcohol') >=0 ): - self.meta['category'] = 'health' - self.meta['subcategory'] = 'drugs' - - - #--------[ Economic ]--------# - elif( search.find('gdp') >=0 and - search.find('trade') <0 and - search.find('import') <0 and - search.find('export') <0 and - search.find('invest') <0 and - search.find('spending') <0 and - search.find('manufactur') <0 and - search.find('military') <0 and - search.find('education') <0 and - search.find('health') <0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'gdp' - - elif( search.find('gni') >=0 or - search.find('gnp') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'gni' - - elif( search.find('debt') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'debt' - - elif( search.find('inflation') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'inflation' - - elif( search.find('health') >=0 and - search.find('spend') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'welfare' - - elif( search.find('manufactur') >=0 or - search.find('business') >=0 or - search.find('tourism') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'business' - - elif( search.find('import') >=0 or - search.find('export') >=0 or - search.find('invest') >=0 or - search.find('tariff') >=0 or - search.find('trade') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'trade' - - elif( search.find('unemployment') >=0 or - search.find('labor') >=0 ): - self.meta['category'] = 'economic' - self.meta['subcategory'] = 'labor-force' - - - #--------[ Development ]--------# - elif( search.find('education') >=0 or - search.find('literacy') >=0 ): - self.meta['category'] = 'development' - self.meta['subcategory'] = 'education' - - elif( search.find('electricity access') >=0 or - search.find('water access') >=0 ): - self.meta['category'] = 'development' - self.meta['subcategory'] = 'infrastructure' - - elif( search.find('development') >=0 or - search.find('competitive') >=0 ): - self.meta['category'] = 'development' - self.meta['subcategory'] = 'technology' - - elif( search.find('hunger') >=0 or - search.find('poverty') >=0 ): - self.meta['category'] = 'development' - self.meta['subcategory'] = 'quality-of-life' - - elif( search.find('co2') >=0 or - search.find('ghg') >=0 or - search.find('emissions') >=0 ): - self.meta['category'] = 'development' - self.meta['subcategory'] = 'emissions' - - elif( search.find('fuel') >=0 or - search.find('coal') >=0 or - search.find('energy') >=0 or - search.find('renewable') >=0 ): - self.meta['category'] = 'development' - self.meta['subcategory'] = 'energy' - - - #--------[ Crime ]--------# - elif( search.find('crime') >=0 or - search.find('homocide') >=0 or - search.find('murder') >=0 ): - self.meta['category'] = 'crime' - - - #--------[ Military ]--------# - elif( search.find('military') >=0 ): - self.meta['category'] = 'military' - - - #--------[ Uncategorised ]--------# - else: - self.meta['category'] = 'uncategorised' - - - # Get Tags - if not self.meta['type'] in self.meta['tags']: self.meta['tags'].append(self.meta['type']) - if not self.meta['category'] in self.meta['tags']: self.meta['tags'].append(self.meta['category']) - if not self.meta['subcategory'] in self.meta['tags']: 
self.meta['tags'].append(self.meta['subcategory']) - for scope in key_scope: - if not scope in self.meta['tags']: - self.meta['tags'].append(scope) - if scope == 'female' or scope == 'male': - self.meta['tags'].append('gender') - if scope == 'black' or scope == 'white' or scope == 'asian' or scope == 'native': - self.meta['tags'].append('race') - - if 'None' in self.meta['tags']: - self.meta['tags'].pop( self.meta['tags'].index('None') ) - - - - #--------[ Clean Scrape Data ]--------# - def clean(self): - - # Break if scrape contains no data - if len(self.data) <= 1: return(1) - - for x in range(1, len(self.data)): - for y in range(0, len(self.data[x])): - self.data[x][y] = self.data[x][y] - - # Remove any inline notes from data - if isinstance(self.data[x][y], str): - self.data[x][y] = re.sub('\[.*\]','', self.data[x][y]) - self.data[x][y] = re.sub('\(.*\)','', self.data[x][y]) - self.data[x][y] = re.sub(',','', self.data[x][y]) - - # Convert numerical strings to floats - if isinstance(self.data[x][y], str): - self.data[x][y] = self.data[x][y].strip() - if any(i.isdigit() for i in self.data[x][y]): - self.data[x][y] = ''.join([i for i in self.data[x][y] if i.isdigit() or i=='.' or i=='-']) - - # Convert To Float - if isfloat(self.data[x][y]): - self.data[x][y] = float(self.data[x][y]) - - # Apply Variable Multiplyer - self.data[x][y] = self.data[x][y] * self.data_info[3][y] - - # Convert Whole Floats To Integers - if self.data[x][y].is_integer(): - self.data[x][y] = int(self.data[x][y]) - - # Convert non-entries to null - if isinstance(self.data[x][y], str): - if( self.data[x][y].lower().find('not determined') >= 0 or - self.data[x][y].lower().find('negligible') >=0 or - self.data[x][y].lower().find('negligible') >=0 or - self.data[x][y].lower().find('unkown') >= 0 ): - self.data[x][y] = None - if( self.data[x][y] == '-' or - self.data[x][y] == '' ): - self.data[x][y] = None - - - - #--------[ Save Scrape Data ]--------# - def save(self): - - # Break if scrape contains no data - if len(self.data) <= 1: return(1) - - key_main = 0 - for i in range(0, len(self.data_info[1])): - if( self.data[0][i] == 'country.name' >= 0 or - self.data[0][i] == 'year' >= 0 ): - key_main = i - - - for key_data in range(0, len(self.data[0])): - if key_data != key_main: - - - #--------[ Generate Filename ]--------# - filename = self.data_info[1][key_data].replace('.','-') - - filepath = 'data/{0}'.format(self.meta['type']) - if self.meta['type'] == 'historical': filepath += '/' + self.meta['scope'].lower().replace(' ','-') - filepath += '/{0}'.format(self.meta['category']) - if self.meta['subcategory'] != None: filepath += '/' + self.meta['subcategory'] - if len(self.data[0]) > 4: - filepath += '/' + self.meta['name'].lower().replace(' ','-') - - fullpath = filepath + '/' + filename + '.json' - - - #--------[ Check File Directory ]--------# - if not os.path.exists(filepath): - os.makedirs(filepath) - - - #--------[ Open File ]--------# - f = open(fullpath, "w") - f.write('{\n') - - - #--------[ Update Metadata ]--------# - self.meta['units'] = self.data_info[2][key_data] - self.meta['year'] = self.data_info[4][key_data] - - if self.meta['scope'] == None: - self.meta['scope'] = self.data_info[5][key_data] - - #--------[ Write Metadata ] - f.write(' "metadata" : {\n') - for i in self.meta: - if isinstance(self.meta[i], str): - f.write(' "{0}" : "{1}"'.format( i, self.meta[i] )) - elif self.meta[i] == None: - f.write(' "{0}" : null'.format( i )) - elif isinstance(self.meta[i], list): - if len(self.meta[i]) <= 0: - 
f.write(' "{0}" : []'.format( i )) - elif i == 'tags': - f.write(' "{0}" : ['.format( i )) - for j in self.meta[i]: - f.write('"{0}"'.format( j )) - if j != self.meta[i][-1]: f.write(',') - f.write(']'.format( i )) - else: - f.write(' "{0}" : [\n'.format( i )) - for j in self.meta[i]: - f.write(' "{0}"'.format( j )) - if j != self.meta[i][-1]: f.write(',\n') - else: f.write('\n') - f.write(' ]'.format( i )) - if i != list(self.meta.keys())[-1]: f.write(',\n') - else: f.write('\n') - f.write(' },\n') - - - - #--------[ Write Actual Data ]--------# - f.write(' "data" : [\n') - - if self.meta['type'] == 'historical': - f.write(' ["{0}","{1}"],\n'.format( - self.data_info[1][key_main], - self.meta['id'] + '.' + self.data_info[1][key_data]) - ) - else: - f.write(' ["{0}","{1}"],\n'.format( - self.data_info[1][key_main], - self.data_info[1][key_data]) - ) - - for row in self.data[1:]: - col_a = row[key_main] - col_b = row[key_data] - - if isinstance(col_a, str): col_a = '"{0}"'.format(col_a) - if isinstance(col_b, str): col_b = '"{0}"'.format(col_b) - - if col_a == None: col_a = 'null' - if col_b == None: col_b = 'null' - - f.write(' [{0},{1}]'.format(col_a, col_b)) - - if row != self.data[-1]: f.write(',\n') - else: f.write('\n') - f.write(' ]\n') - - - - #--------[ Final Result ]--------# - f.write('}\n') - f.close() - print(' [{0} data points] -> {1}'.format(len(self.data)-1, fullpath)) + if len(self.data) <= 1: return(-1) # Break if no data + + self.meta_name() # Clean set name + self.meta_search() # Create search-space + self.meta_keys() # Extract variable key-name + self.meta_year() # Extract variable year + self.meta_units() # Extract variable unit + self.meta_multiplyer() # Extract variable multiplyer + self.meta_scope() # Extract variable scope + self.meta_category() # Extract set category + self.meta_type() # Extract set type + self.meta_tags() # Extract set tag diff --git a/masterscraper/core/clean.py b/masterscraper/core/clean.py new file mode 100644 index 0000000..c6ca9fd --- /dev/null +++ b/masterscraper/core/clean.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 + + +import re + +# Check If String Is Number +def isfloat(num): + try: + float(num) + return True + except: + return False + + +#--------[ Clean Scrape Data ]--------# +def clean(self): + if len(self.data) <= 1: return(-1) # Break if no data + + + for x in range(1, len(self.data)): + for y in range(0, len(self.data[x])): + self.data[x][y] = self.data[x][y] + + # Remove any inline notes from data + if isinstance(self.data[x][y], str): + self.data[x][y] = re.sub('\[.*\]','', self.data[x][y]) + self.data[x][y] = re.sub('\(.*\)','', self.data[x][y]) + self.data[x][y] = re.sub(',','', self.data[x][y]) + + # Convert numerical strings to floats + if isinstance(self.data[x][y], str): + self.data[x][y] = self.data[x][y].strip() + if any(i.isdigit() for i in self.data[x][y]): + self.data[x][y] = ''.join([i for i in self.data[x][y] if i.isdigit() or i=='.' 
or i=='-']) + + # Convert To Float + if isfloat(self.data[x][y]): + self.data[x][y] = float(self.data[x][y]) + + # Apply Variable Multiplyer + self.data[x][y] = self.data[x][y] * self.info['multiplyer'][y] + + # Convert Whole Floats To Integers + if self.data[x][y].is_integer(): + self.data[x][y] = int(self.data[x][y]) + + # Convert non-entries to null + if isinstance(self.data[x][y], str): + if( self.data[x][y].lower().find('not determined') >= 0 or + self.data[x][y].lower().find('negligible') >=0 or + self.data[x][y].lower().find('negligible') >=0 or + self.data[x][y].lower().find('unkown') >= 0 ): + self.data[x][y] = None + if( self.data[x][y] == '-' or + self.data[x][y] == '' ): + self.data[x][y] = None diff --git a/masterscraper/core/get_list.py b/masterscraper/core/get_list.py new file mode 100644 index 0000000..58dc315 --- /dev/null +++ b/masterscraper/core/get_list.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python3 + + + +#--------[ Load URL Scrape List ]--------# +def get_list(filename): + f = open(filename, 'r') + tmp_list = [l.strip() for l in f.readlines()] + f.close() + return(tmp_list) diff --git a/masterscraper/core/meta_category.py b/masterscraper/core/meta_category.py new file mode 100644 index 0000000..08fa0f6 --- /dev/null +++ b/masterscraper/core/meta_category.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 + + +#--------[ Extract Category Information ]--------# +def meta_category(self): + search = self.meta['name'].join(self.info['search']).lower().strip() + + + #--------[ Geographic ]--------# + if( search.find('area') >=0 or + search.find('km2') >=0 ): + self.meta['category'] = 'geographic' + self.meta['subcategory'] = 'area' + + elif( (search.find('arable') >=0 or + search.find('farm') >=0 or + search.find('forrested') >=0) and + search.find('land') >=0 ): + self.meta['category'] = 'geographic' + self.meta['subcategory'] = 'land' + + + #--------[ Demographic ]-------- + elif( search.find('population') >=0 and + search.find('access') <0 and + search.find('murder') <0 and + search.find('crime') <0 and + search.find('hunger') <0 and + search.find('migrat') <0 and + search.find('migrant') <0 ): + self.meta['category'] = 'demogrpahic' + self.meta['subcategory'] = 'population' + + elif( (search.find('birth') >=0 or + search.find('fertility') >=0) and + search.find('mortality') <0 ): + self.meta['category'] = 'demogrpahic' + self.meta['subcategory'] = 'fertility' + + elif( search.find('immigrat') >=0 or + search.find('migrat') >=0 or + search.find('migrant') >=0 or + search.find('refugee') >=0 or + search.find('asylum') >=0 ): + self.meta['category'] = 'demogrpahic' + self.meta['subcategory'] = 'migration' + + + #--------[ Health ]--------# + elif( search.find('life expectancy') >=0 or + search.find('death') >=0 or + search.find('suicide') >=0 or + search.find('mortality') >=0 ): + self.meta['category'] = 'health' + self.meta['subcategory'] = 'mortality' + + elif( search.find('depression') >=0 or + search.find('anxiety') >=0 ): + self.meta['category'] = 'health' + self.meta['subcategory'] = 'psychology' + + elif( search.find('smoking') >= 0 or + search.find('alcohol') >=0 ): + self.meta['category'] = 'health' + self.meta['subcategory'] = 'drugs' + + + #--------[ Economic ]--------# + elif( search.find('gdp') >=0 and + search.find('trade') <0 and + search.find('import') <0 and + search.find('export') <0 and + search.find('invest') <0 and + search.find('spending') <0 and + search.find('manufactur') <0 and + search.find('military') <0 and + search.find('education') <0 and + 
search.find('health') <0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'gdp' + + elif( search.find('gni') >=0 or + search.find('gnp') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'gni' + + elif( search.find('debt') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'debt' + + elif( search.find('inflation') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'inflation' + + elif( search.find('health') >=0 and + search.find('spend') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'welfare' + + elif( search.find('manufactur') >=0 or + search.find('business') >=0 or + search.find('tourism') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'business' + + elif( search.find('import') >=0 or + search.find('export') >=0 or + search.find('invest') >=0 or + search.find('tariff') >=0 or + search.find('trade') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'trade' + + elif( search.find('unemployment') >=0 or + search.find('labor') >=0 ): + self.meta['category'] = 'economic' + self.meta['subcategory'] = 'labor-force' + + + #--------[ Development ]--------# + elif( search.find('education') >=0 or + search.find('literacy') >=0 ): + self.meta['category'] = 'development' + self.meta['subcategory'] = 'education' + + elif( search.find('electricity access') >=0 or + search.find('water access') >=0 ): + self.meta['category'] = 'development' + self.meta['subcategory'] = 'infrastructure' + + elif( search.find('development') >=0 or + search.find('competitive') >=0 ): + self.meta['category'] = 'development' + self.meta['subcategory'] = 'technology' + + elif( search.find('hunger') >=0 or + search.find('poverty') >=0 ): + self.meta['category'] = 'development' + self.meta['subcategory'] = 'quality-of-life' + + elif( search.find('co2') >=0 or + search.find('ghg') >=0 or + search.find('emissions') >=0 ): + self.meta['category'] = 'development' + self.meta['subcategory'] = 'emissions' + + elif( search.find('fuel') >=0 or + search.find('coal') >=0 or + search.find('energy') >=0 or + search.find('renewable') >=0 ): + self.meta['category'] = 'development' + self.meta['subcategory'] = 'energy' + + + #--------[ Crime ]--------# + elif( search.find('crime') >=0 or + search.find('homocide') >=0 or + search.find('murder') >=0 ): + self.meta['category'] = 'crime' + + + #--------[ Military ]--------# + elif( search.find('military') >=0 ): + self.meta['category'] = 'military' + + + #--------[ Uncategorised ]--------# + else: + self.meta['category'] = 'uncategorised' diff --git a/masterscraper/core/meta_keys.py b/masterscraper/core/meta_keys.py new file mode 100644 index 0000000..4dd43e2 --- /dev/null +++ b/masterscraper/core/meta_keys.py @@ -0,0 +1,58 @@ +#!/usr/bin/env python3 + + +import re + + +#--------[ Process Variable Key Names ]-------- +def meta_keys(self): + key_name = [] + + for key in self.data[0]: + if(key.lower().find('country') >=0 or + key.lower().find('countries') >=0 or + key.lower().find('dependency') >=0 ): + key_name.append('country.name') + elif(key.lower().find('year') >=0): + key_name.append('year') + elif(key.lower().find('date') >=0): + key_name.append('date') + else: + + tmp_key = key + tmp_key = tmp_key.lower() + + tmp_key = re.sub(',', '', tmp_key) + tmp_key = re.sub('\[.*\]', '', tmp_key) + tmp_key = re.sub('\(.*\)', '', tmp_key) + tmp_key = re.sub('km2', '', tmp_key) + tmp_key = re.sub('km', '', tmp_key) + tmp_key = re.sub('mi2', '', 
tmp_key) + tmp_key = re.sub('hectares', '', tmp_key) + tmp_key = re.sub('\ in\ ', '', tmp_key) + tmp_key = re.sub('US\ \$', '', tmp_key) + tmp_key = re.sub('\$', 'dollars', tmp_key) + tmp_key = re.sub('\%', 'percent', tmp_key) + + tmp_key = re.sub('and\ dependencies ', '', tmp_key) + tmp_key = re.sub('list\ of\ countries\ by\ ', '', tmp_key) + + tmp_key = re.sub('thousands\ of', '' ,tmp_key) + tmp_key = re.sub('millions\ of', '' ,tmp_key) + tmp_key = re.sub('billions\ of', '' ,tmp_key) + + tmp_key = re.sub('per\ 100k\ live\ births', '', tmp_key) + tmp_key = re.sub('per\ 100k\ population', '', tmp_key) + + tmp_key = tmp_key.strip() + tmp_key = tmp_key.replace(' ','.') + + if tmp_key.find(self.meta['name'].lower().replace(' ','.')) <0: + if tmp_key != '': + tmp_key = self.meta['name'].lower().replace(' ','.') + '.' + tmp_key + else: + tmp_key = self.meta['name'].lower().replace(' ','.') + + #--------[ Add Name To Info Array ]--------# + key_name.append( tmp_key ) + self.info['keys'] = key_name diff --git a/masterscraper/core/meta_multiplyer.py b/masterscraper/core/meta_multiplyer.py new file mode 100644 index 0000000..36fe0a4 --- /dev/null +++ b/masterscraper/core/meta_multiplyer.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 + + + +import re + + + +#--------[ Process Variable Multiplyer +def meta_multiplyer(self): + key_multiplyer = [] + for key in self.info['search']: + + if( key.find('%') >=0 or key.find('percent') >=0 ): + key_multiplyer.append( 0.01 ) + + elif( re.search('\$.*k', key) ): key_multiplyer.append(1000) + elif( re.search('\$.*m', key) ): key_multiplyer.append(1000000) + elif( re.search('\$.*b', key) ): key_multiplyer.append(1000000000) + + elif( key.find('thousands of') >=0 ): + key_multiplyer.append(1000) + elif( key.find('millions of') >=0 ): + key_multiplyer.append(1000000) + elif( key.find('bilions of') >=0 ): + key_multiplyer.append(1000000000) + + elif( key.find('mi2') >=0 or key.find('mi²') >=0 ): + key_multiplyer.append(2.59) + elif( key.find('hectare') >=0 ): + key_multiplyer.append(0.01) + + else: + key_multiplyer.append( 1.0 ) + + self.info['multiplyer'] = key_multiplyer diff --git a/masterscraper/core/meta_name.py b/masterscraper/core/meta_name.py new file mode 100644 index 0000000..f6932af --- /dev/null +++ b/masterscraper/core/meta_name.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + + + +import re + + + +#--------[ Process Vaiable Set Names ]-------#- +def meta_name(self): + self.meta['name'] = self.meta['name'].lower() + self.meta['name'] = re.sub('and\ dependencies ','',self.meta['name']) + self.meta['name'] = re.sub('list\ of\ ','',self.meta['name']) + self.meta['name'] = re.sub(',','',self.meta['name']) + self.meta['name'] = self.meta['name'].strip() + self.meta['name'] = self.meta['name'].title() + + self.meta['name'] = self.meta['name'].replace('Gdp', 'GDP') + self.meta['name'] = self.meta['name'].replace('Gni', 'GNI') + self.meta['name'] = self.meta['name'].replace('Gnp', 'GNP') + + diff --git a/masterscraper/core/meta_scope.py b/masterscraper/core/meta_scope.py new file mode 100644 index 0000000..e12c996 --- /dev/null +++ b/masterscraper/core/meta_scope.py @@ -0,0 +1,17 @@ +#!/usr/bin/env python3 + + +#--------[ Get Variable Scope ]--------# +def meta_scope(self): + key_scope = [] + for key in self.info['search']: + if key.find('female') >=0: key_scope.append( 'female' ) + elif key.find('male') >=0: key_scope.append( 'male' ) + elif key.find('black') >=0: key_scope.append( 'black' ) + elif key.find('white') >=0: key_scope.append( 'white' ) + elif 
key.find('asian') >=0: key_scope.append( 'asian' ) + elif key.find('native') >=0: key_scope.append( 'native' ) + elif key.find('urban') >=0: key_scope.append( 'urban' ) + elif key.find('rural') >=0: key_scope.append( 'rural' ) + else: key_scope.append( self.meta['type'] ) + self.info['scope'] = key_scope diff --git a/masterscraper/core/meta_search.py b/masterscraper/core/meta_search.py new file mode 100644 index 0000000..dbefb30 --- /dev/null +++ b/masterscraper/core/meta_search.py @@ -0,0 +1,15 @@ +#!/usr/bin/env python3 + + +#--------[ Create Variable Search Space ]--------# +def meta_search(self): + key_search = [] + + for i in range(0, len(self.data[0])): + key_search.append( + self.meta['name'].lower() + ' ' + + self.data[0][i].lower() + ' ' + + self.data[1][i].lower() + ) + + self.info['search'] = key_search diff --git a/masterscraper/core/meta_tags.py b/masterscraper/core/meta_tags.py new file mode 100644 index 0000000..70ee3bd --- /dev/null +++ b/masterscraper/core/meta_tags.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 + +#--------[ Extract Tag Information ]--------# +def meta_tags(self): + if not self.meta['type'] in self.meta['tags']: self.meta['tags'].append(self.meta['type']) + if not self.meta['category'] in self.meta['tags']: self.meta['tags'].append(self.meta['category']) + if not self.meta['subcategory'] in self.meta['tags']: self.meta['tags'].append(self.meta['subcategory']) + + if self.meta['scope'] != None: + if not self.meta['scope'].lower() in self.meta['tags']: + self.meta['tags'].append( self.meta['scope'].lower() ) + + for scope in self.info['scope']: + if not scope in self.meta['tags']: + self.meta['tags'].append(scope) + if scope == 'female' or scope == 'male': + self.meta['tags'].append('gender') + if scope == 'black' or scope == 'white' or scope == 'asian' or scope == 'native': + self.meta['tags'].append('race') + + if None in self.meta['tags']: + self.meta['tags'].pop( self.meta['tags'].index(None) ) + if 'None' in self.meta['tags']: + self.meta['tags'].pop( self.meta['tags'].index('None') ) diff --git a/masterscraper/core/meta_type.py b/masterscraper/core/meta_type.py new file mode 100644 index 0000000..8f1aa5e --- /dev/null +++ b/masterscraper/core/meta_type.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + + +#--------[ Get Variable Type ]--------# +def meta_type(self): + for key in self.info['keys']: + if key == 'country.name': self.meta['type'] = 'global' + elif key == 'year': self.meta['type'] = 'historical' + elif key == 'date': self.meta['type'] = 'historical' + elif key == 'us.county.fips': self.meta['type'] = 'regional' + elif key == 'uk.constituency.name': self.meta['type'] = 'regional' + if self.meta['type'] == None: self.meta['type'] = 'unkown' diff --git a/masterscraper/core/meta_units.py b/masterscraper/core/meta_units.py new file mode 100644 index 0000000..d9fae2d --- /dev/null +++ b/masterscraper/core/meta_units.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +#--------[ Process Variable Unit Type ]--------# + +def meta_units(self): + key_unit = [] + for key in self.info['search']: + + if( key.find('percent') >=0 or + key.find('perc') >=0 or + key.find('%') >=0 ): + key_unit.append('%') + + elif( key.find('dollar') >=0 or + key.find('$') >=0 ): + key_unit.append('$') + + elif( key.find('euro') >=0 or + key.find('€') >=0 ): + key_unit.append('€') + + elif( key.find('area') >=0 or + key.find('land') >=0 or + key.find('km2') >=0 or + key.find('km²') >=0 or + key.find('mi2') >=0 or + key.find('mi²') >=0 or + key.find('ha') >=0 or + key.find('hectares') 
>=0 ): + key_unit.append('km²') + + elif( key.find('country') >=0 or + key.find('countries') >=0 or + key.find('dependencies') >=0 ): + key_unit.append('countries') + + elif( key.find('index') >=0 or + key.find('score') >=0 or + key.find('report') >=0 ): + key_unit.append('index') + + elif( key.find('population') >=0 and + key.find('density') <0 and + key.find('access') <0 and + key.find('crime') <0 and + key.find('murder') <0 ): + key_unit.append('people') + + elif( key.find('population') >=0 and + key.find('density') >=0 ): + key_unit.append('people/km²') + + elif( (key.find('death') >=0 or + key.find('mortality') >=0) and + key.find('rate') >=0 and + key.find('infant') <0 and + key.find('maternal') <0 ): + key_unit.append('deaths/1k population') + + elif( key.find('mortality') >=0 and + key.find('rate') >=0 and + key.find('infant') >=0 ): + key_unit.append('deaths/1k live births') + + elif( key.find('mortality') >=0 and + key.find('rate') >=0 and + key.find('maternal') >=0 ): + key_unit.append('deaths/100k live births') + + elif( key.find('suicide') >=0 and + key.find('rate') >=0 ): + key_unit.append('deaths/100k population') + + elif( key.find('life') >=0 and + key.find('expectancy') >=0 ): + key_unit.append('years') + + elif( key.find('birth') >=0 and + key.find('rate') >=0 ): + key_unit.append('births/1k population') + + elif( key.find('fertility') >=0 and + key.find('rate') >=0 ): + key_unit.append('children/women') + + elif( key.find('marriage') >=0 and + key.find('rate') >=0 ): + key_unit.append('marriages/1k population') + + elif( key.find('divorce') >=0 and + key.find('rate') >=0 ): + key_unit.append('divorces/1k population') + + elif( key.find('crime') >=0 and + key.find('rate') >=0 ): + key_unit.append('crimes/100k population') + + elif( key.find('murder') >=0 and + key.find('rate') >=0 ): + key_unit.append('murders/100k population') + + elif( key.find('military') >=0 and + key.find('size') >=0 ): + key_unit.append('personel') + + elif( key.find('immigration') >=0 or + key.find('migration') >=0 or + key.find('refugee') >=0 ): + key_unit.append('people') + + elif( key.find('emissions') >=0 ): + key_unit.append('tonnes') + + else: + key_unit.append('unkown') + + self.info['units'] = key_unit diff --git a/masterscraper/core/meta_year.py b/masterscraper/core/meta_year.py new file mode 100644 index 0000000..9029b8e --- /dev/null +++ b/masterscraper/core/meta_year.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 + + +from datetime import date +import re + + +#--------[ Get Variable Year ]--------# +def meta_year(self): + key_year = [] + for key in self.data[0]: + + if re.match('\d\d\d\d', key): + key_year.append( key ) + + elif 'year' in self.info['keys']: + y1 = self.data[1][self.info['keys'].index('year')] + y2 = self.data[-1][self.info['keys'].index('year')] + if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) ) + if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) ) + + elif 'date' in self.info['keys']: + y1 = self.data[1][self.info['keys'].index('date')].split('-')[0] + y2 = self.data[-1][self.info['keys'].index('date')].split('-')[0] + if y1 <= y2: key_year.append( '{0}-{1}'.format(y1,y2) ) + if y1 > y2: key_year.append( '{0}-{1}'.format(y2,y1) ) + + else: + key_year.append( date.today().strftime('%Y') ) + + self.info['year'] = key_year diff --git a/masterscraper/core/save.py b/masterscraper/core/save.py new file mode 100644 index 0000000..4dddc2a --- /dev/null +++ b/masterscraper/core/save.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + + +import os + + +#--------[ Save Scrape 
Data ]--------# +def save(self): + if len(self.data) <= 1: return(-1) # Break if no data + + + key_main = 0 + for i in range(0, len(self.info['keys'])): + if( self.data[0][i] == 'country.name' >= 0 or + self.data[0][i] == 'year' >= 0 ): + key_main = i + + + for key_data in range(0, len(self.data[0])): + if key_data != key_main: + + + #--------[ Generate Filename ]--------# + filename = self.info['keys'][key_data].replace('.','-') + + filepath = 'data/{0}'.format(self.meta['type']) + if self.meta['type'] == 'historical': filepath += '/' + self.meta['scope'].lower().replace(' ','-') + filepath += '/{0}'.format(self.meta['category']) + if self.meta['subcategory'] != None: filepath += '/' + self.meta['subcategory'] + if len(self.data[0]) > 4: + filepath += '/' + self.meta['name'].lower().replace(' ','-') + + fullpath = filepath + '/' + filename + '.json' + + + #--------[ Check File Directory ]--------# + if not os.path.exists(filepath): + os.makedirs(filepath) + + + #--------[ Open File ]--------# + f = open(fullpath, "w") + f.write('{\n') + + + #--------[ Update Metadata ]--------# + self.meta['units'] = self.info['units'][key_data] + self.meta['year'] = self.info['year'][key_data] + + if self.meta['scope'] == None: + self.meta['scope'] = self.info['scope'][key_data] + + #--------[ Write Metadata ] + f.write(' "metadata" : {\n') + for i in self.meta: + if isinstance(self.meta[i], str): + f.write(' "{0}" : "{1}"'.format( i, self.meta[i] )) + elif self.meta[i] == None: + f.write(' "{0}" : null'.format( i )) + elif isinstance(self.meta[i], list): + if len(self.meta[i]) <= 0: + f.write(' "{0}" : []'.format( i )) + elif i == 'tags': + f.write(' "{0}" : ['.format( i )) + for j in self.meta[i]: + f.write('"{0}"'.format( j )) + if j != self.meta[i][-1]: f.write(',') + f.write(']'.format( i )) + else: + f.write(' "{0}" : [\n'.format( i )) + for j in self.meta[i]: + f.write(' "{0}"'.format( j )) + if j != self.meta[i][-1]: f.write(',\n') + else: f.write('\n') + f.write(' ]'.format( i )) + if i != list(self.meta.keys())[-1]: f.write(',\n') + else: f.write('\n') + f.write(' },\n') + + + + #--------[ Write Actual Data ]--------# + f.write(' "data" : [\n') + + if self.meta['type'] == 'historical': + f.write(' ["{0}","{1}"],\n'.format( + self.info['keys'][key_main], + self.meta['id'] + '.' 
+ self.info['keys'][key_data]) + ) + else: + f.write(' ["{0}","{1}"],\n'.format( + self.info['keys'][key_main], + self.info['keys'][key_data]) + ) + + for row in self.data[1:]: + col_a = row[key_main] + col_b = row[key_data] + + if isinstance(col_a, str): col_a = '"{0}"'.format(col_a) + if isinstance(col_b, str): col_b = '"{0}"'.format(col_b) + + if col_a == None: col_a = 'null' + if col_b == None: col_b = 'null' + + f.write(' [{0},{1}]'.format(col_a, col_b)) + + if row != self.data[-1]: f.write(',\n') + else: f.write('\n') + f.write(' ]\n') + + + + #--------[ Final Result ]--------# + f.write('}\n') + f.close() + print(' [{0} data points] -> {1}'.format(len(self.data)-1, fullpath)) diff --git a/masterscraper/core/show.py b/masterscraper/core/show.py new file mode 100644 index 0000000..03853f2 --- /dev/null +++ b/masterscraper/core/show.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python3 + + + +#--------[ Show Scrape Data ]--------# +def show(self): + print(self.meta) + for row in self.data: + print(row) diff --git a/masterscraper/macrotrends/__init__.py b/masterscraper/macrotrends/__init__.py index 7b70491..5ed19c1 100644 --- a/masterscraper/macrotrends/__init__.py +++ b/masterscraper/macrotrends/__init__.py @@ -42,33 +42,32 @@ def scrapelist(): print(url['href']) #break - print('\nScraping {0} datasets from MacroTrends\n'.format( len(full_list) )) + print('\nScraping {0} self.datasets from MacroTrends\n'.format( len(full_list) )) return full_list -def scrape(url, meta, data): - #--------[ Get Page From URL ]--------# +def scrape(self, url): +#--------[ Get Page From URL ]--------# soup = getpage(url) - #--------[ Get Metadata ]--------# url_parts = url.split('/') - meta['name'] = url_parts[-1].replace('-',' ').title() + self.meta['name'] = url_parts[-1].replace('-',' ').title() soup_desc = getpage( 'https://www.macrotrends.net/countries/ranking/' + url.split('/')[-1] ) - meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text + self.meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text - meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text ) + self.meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text ) - meta['sources'].append( url ) + self.meta['sources'].append( url ) - meta['scope'] = url_parts[-2].replace('-',' ').title() + self.meta['scope'] = url_parts[-2].replace('-',' ').title() - meta['id'] = url_parts[-3].lower() + self.meta['id'] = url_parts[-3].lower() @@ -80,40 +79,40 @@ def scrape(url, meta, data): for tr in table.find_all('tr'): row = [ th.text.strip() for th in tr.find_all('th')] if len(row) > 1: - data.append( row ) + self.data.append( row ) # Get Table Data for tr in table.find_all('tr'): row = [ td.text.strip() for td in tr.find_all('td')] if len(row) > 1: - data.append( row ) + self.data.append( row ) #--------[ Process Table ]-------- # Delete rows with incorrect number of variables key = 0 - key_len = len(data) + key_len = len(self.data) while key < key_len: - if len(data[key]) != len(data[0]): - data.pop(key) + if len(self.data[key]) != len(self.data[0]): + self.data.pop(key) key = key-1 key = key+1 - key_len = len(data) + key_len = len(self.data) # Delete unwanted table columns key = 0 - key_len = len(data[0]) + key_len = len(self.data[0]) while key < key_len: flag = False - if data[0][key].lower().find('rank') >=0: flag = True - if data[0][key].lower().find('change') >=0: flag = True - if data[0][key].lower().find('notes') >=0: flag = True - if 
data[0][key].lower().find('gap') >=0: flag = True - if data[0][key].lower().find('Δ') >=0: flag = True - if data[0][key].lower().find('growth') >=0: flag = True + if self.data[0][key].lower().find('rank') >=0: flag = True + if self.data[0][key].lower().find('change') >=0: flag = True + if self.data[0][key].lower().find('notes') >=0: flag = True + if self.data[0][key].lower().find('gap') >=0: flag = True + if self.data[0][key].lower().find('Δ') >=0: flag = True + if self.data[0][key].lower().find('growth') >=0: flag = True if flag: - for i in range(0, len(data)): - data[i].pop(key) + for i in range(0, len(self.data)): + self.data[i].pop(key) key = key-1 key = key+1 - key_len = len(data[0]) + key_len = len(self.data[0]) diff --git a/masterscraper/wikipedia/__init__.py b/masterscraper/wikipedia/__init__.py index c4877c2..71314db 100644 --- a/masterscraper/wikipedia/__init__.py +++ b/masterscraper/wikipedia/__init__.py @@ -22,15 +22,15 @@ def getpage(url): return soup -def scrape(url, meta, data): +def scrape(self, url): #--------[ Get Page From URL ]--------# soup = getpage(url) #--------[ Get Metadata ]--------# - meta['name'] = soup.find('span', class_='mw-page-title-main').text - meta['description'] = re.sub('\[.*?\]', '', soup.select('p')[0].getText().strip()).replace('\n',' ') - meta['sources'].append(url) + self.meta['name'] = soup.find('span', class_='mw-page-title-main').text + self.meta['description'] = re.sub('\[.*?\]', '', soup.select('p')[0].getText().strip()).replace('\n',' ') + self.meta['sources'].append(url) @@ -41,39 +41,39 @@ def scrape(url, meta, data): for tr in table.find_all('tr'): row = [ th.text.strip() for th in tr.find_all('th')] if len(row) > 1: - data.append( row ) + self.data.append( row ) # Get Table Data for tr in table.find_all('tr'): row = [ td.text.strip() for td in tr.find_all('td')] if len(row) > 1: - data.append( row ) + self.data.append( row ) #--------[ Process Table ]-------- # Delete rows with incorrect number of variables key = 0 - key_len = len(data) + key_len = len(self.data) while key < key_len: - if len(data[key]) != len(data[0]): - data.pop(key) + if len(self.data[key]) != len(self.data[0]): + self.data.pop(key) key = key-1 key = key+1 - key_len = len(data) + key_len = len(self.data) # Delete unwanted table columns key = 0 - key_len = len(data[0]) + key_len = len(self.data[0]) while key < key_len: flag = False - if data[0][key].lower().find('rank') >=0: flag = True - if data[0][key].lower().find('change') >=0: flag = True - if data[0][key].lower().find('notes') >=0: flag = True - if data[0][key].lower().find('gap') >=0: flag = True - if data[0][key].lower().find('Δ') >=0: flag = True + if self.data[0][key].lower().find('rank') >=0: flag = True + if self.data[0][key].lower().find('change') >=0: flag = True + if self.data[0][key].lower().find('notes') >=0: flag = True + if self.data[0][key].lower().find('gap') >=0: flag = True + if self.data[0][key].lower().find('Δ') >=0: flag = True if flag: - for i in range(0, len(data)): - data[i].pop(key) + for i in range(0, len(self.data)): + self.data[i].pop(key) key = key-1 key = key+1 - key_len = len(data[0]) + key_len = len(self.data[0]) diff --git a/scrape_single.py b/scrape_single.py index 77fab79..8764ea5 100644 --- a/scrape_single.py +++ b/scrape_single.py @@ -6,6 +6,7 @@ import masterscraper as ms scrape = ms.scrape('https://www.macrotrends.net/countries/CHN/china/net-migration') +#scrape = ms.scrape('https://www.macrotrends.net/countries/CHN/china/electricity-access-statistics') scrape.get_meta() 
 scrape.clean()
 scrape.save()
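
A minimal batch-driver sketch (not part of the diff above) showing how the refactored pieces compose: it assumes a hypothetical companion script name (scrape_batch.py) and URL list file (scrapelist.txt, one URL per line), and it relies only on methods introduced in this diff — get_list() from core/get_list.py, plus get_meta(), clean() and save(), which each return -1 when the scrape holds no usable table.

#!/usr/bin/env python3
# scrape_batch.py -- hypothetical companion to scrape_single.py (sketch only, not in this diff)

import masterscraper as ms

# get_list() is defined as get_list(filename), so call it on the class rather than
# an instance; 'scrapelist.txt' is an assumed filename with one URL per line.
for url in ms.scrape.get_list('scrapelist.txt'):
    s = ms.scrape(url)          # constructor dispatches to the wikipedia/macrotrends scrapers
    if s.get_meta() == -1:      # -1 signals the scrape produced no usable data
        continue
    s.clean()
    s.save()

The sketch also illustrates the design choice behind the refactor: each core/*.py module defines a plain function taking self as its first argument, and importing those functions inside the scrape class body turns them into ordinary bound methods without keeping one monolithic __init__.py.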