# masterscraper/masterscraper/macrotrends/__init__.py

#!/usr/bin/python3

import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from datetime import date


def getpage(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection raises instead of blocking
            forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.
    """
    #--------[ Get Page From URL ]--------#
    # Only the User-Agent is sent: the Access-Control-* entries that used to
    # live in this dict are CORS *response* headers and mean nothing on a
    # request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    # `headers` must be passed by keyword: the second positional parameter of
    # requests.get() is `params`, which would append these values to the
    # query string instead of sending them as HTTP headers.
    page = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup


def scrapelist():
    """Collect the URLs of every per-country dataset page on MacroTrends.

    The topic overview page is fetched first; every anchor found inside a
    ``div.col-xs-3`` element is treated as a global-metric page. Each of
    those pages is then fetched, and every anchor inside its first
    ``div.col-xs-12`` element is recorded as a dataset URL.

    Returns:
        A list of absolute dataset URLs, in the order they were found.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import urljoin

    base_url = 'https://www.macrotrends.net'
    soup = getpage(base_url + '/countries/topic-overview')

    # Get URL list of global metrics
    links = []
    for column in soup.find_all('div', class_='col-xs-3'):
        # href=True skips anchors without an href, which would otherwise
        # raise KeyError on anchor['href'].
        for anchor in column.find_all('a', href=True):
            # urljoin handles both site-relative and already-absolute hrefs.
            links.append(urljoin(base_url, anchor['href']))

    # Get full country list for each global metric
    full_list = []
    for link in links:
        container = getpage(link).find('div', class_='col-xs-12')
        if container is None:
            # One page with an unexpected layout should not abort the
            # whole crawl.
            print('Skipping {0}: no country list found'.format(link))
            continue
        for anchor in container.find_all('a', href=True):
            full_list.append(urljoin(base_url, anchor['href']))
            print(anchor['href'])

    print('\nScraping {0} self.datasets from MacroTrends\n'.format( len(full_list) ))

    return full_list


def scrape(self, url):
#--------[ Get Page From URL ]--------#
    soup = getpage(url)

    #--------[ Get Metadata ]--------#
    url_parts = url.split('/')

    self.meta['name'] = url_parts[-1].replace('-',' ').title()

    soup_desc = getpage( 'https://www.macrotrends.net/countries/ranking/' + url.split('/')[-1] )
    self.meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text

    self.meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text )

    self.meta['sources'].append( url )

    self.meta['scope'] = url_parts[-2].replace('-',' ').title()

    self.meta['id'] = url_parts[-3].lower()


    #--------[ Extract Table ]--------#
    table = soup.find('div', class_='col-xs-6')
    table = table.find('table', class_='historical_data_table')

    # Get Table Headings
    for tr in table.find_all('tr'):
        row = [ th.text.strip() for th in tr.find_all('th')]
        if len(row) > 1:
            self.data.append( row )

    # Get Table Data
    for tr in table.find_all('tr'):
        row = [ td.text.strip() for td in tr.find_all('td')]
        if len(row) > 1:
            self.data.append( row )

    #--------[ Process Table ]--------

    # Delete rows with incorrect number of variables
    key = 0
    key_len = len(self.data)
    while key < key_len:
        if len(self.data[key]) != len(self.data[0]):
            self.data.pop(key)
            key = key-1
        key = key+1
        key_len = len(self.data)

    # Delete unwanted table columns
    key = 0
    key_len = len(self.data[0])
    while key < key_len:
        flag = False
        if self.data[0][key].lower().find('rank') >=0: flag = True
        if self.data[0][key].lower().find('change') >=0: flag = True
        if self.data[0][key].lower().find('notes') >=0: flag = True
        if self.data[0][key].lower().find('gap') >=0: flag = True
        if self.data[0][key].lower().find('Δ') >=0: flag = True
        if self.data[0][key].lower().find('growth') >=0: flag = True
        if flag:
            for i in range(0, len(self.data)):
                self.data[i].pop(key)
            key = key-1
        key = key+1
        key_len = len(self.data[0])