From fbf29f97592ebd5a368820fe0f312c919e2473b5 Mon Sep 17 00:00:00 2001 From: colttaine Date: Sun, 5 Mar 2023 19:53:10 +1100 Subject: [PATCH] Pull correct description data from macrotrends --- masterscraper/__init__.py | 45 +++++++++++++++++++++++++-- masterscraper/macrotrends/__init__.py | 3 +- scrape_list.py | 3 -- scrape_single.py | 2 +- 4 files changed, 45 insertions(+), 8 deletions(-) diff --git a/masterscraper/__init__.py b/masterscraper/__init__.py index f7503b0..31ba522 100644 --- a/masterscraper/__init__.py +++ b/masterscraper/__init__.py @@ -97,6 +97,10 @@ class scrape: self.meta['name'] = self.meta['name'].strip() self.meta['name'] = self.meta['name'].title() + self.meta['name'] = self.meta['name'].replace('Gdp', 'GDP') + self.meta['name'] = self.meta['name'].replace('Gni', 'GNI') + self.meta['name'] = self.meta['name'].replace('Gnp', 'GNP') + # Get Key Names self.data_info.append( [key for key in self.data[0]] ) @@ -152,12 +156,14 @@ class scrape: key.lower().find('€') >=0 ): key_unit.append('€') - elif( key.lower().find('km2') >=0 or + elif( key.lower().find('area') >=0 or + key.lower().find('land') >=0 or + key.lower().find('km2') >=0 or key.lower().find('km²') >=0 or key.lower().find('mi2') >=0 or key.lower().find('mi²') >=0 ): - key_unit.append('km²') + elif( key.lower().find('country') >=0 or key.lower().find('countries') >=0 or key.lower().find('dependencies') >=0 ): @@ -178,13 +184,29 @@ class scrape: elif( key.lower().find('death') >=0 or key.lower().find('mortality') >=0 and - key.lower().find('rate') >=0 ): + key.lower().find('rate') >=0 and + key.lower().find('infant') <0 and + key.lower().find('maternal') <0 ): key_unit.append('deaths/1k population') + elif( key.lower().find('mortality') >=0 and + key.lower().find('rate') >=0 and + key.lower().find('infant') >=0 ): + key_unit.append('deaths/100k live births') + + elif( key.lower().find('mortality') >=0 and + key.lower().find('rate') >=0 and + key.lower().find('maternal') >=0 ): + key_unit.append('deaths/1k live births') + elif( key.lower().find('birth') >=0 and key.lower().find('rate') >=0 ): key_unit.append('births/1k population') + elif( key.lower().find('fertility') >=0 and + key.lower().find('rate') >=0 ): + key_unit.append('children/women') + elif( key.lower().find('marriage') >=0 and key.lower().find('rate') >=0 ): key_unit.append('marriages/1k population') @@ -201,6 +223,19 @@ class scrape: key.lower().find('rate') >=0 ): key_unit.append('murders/100k population') + elif( key.lower().find('military') >=0 and + key.lower().find('size') >=0 ): + key_unit.append('personel') + + elif( key.lower().find('immigration') >=0 or + key.lower().find('migration') >=0 or + key.lower().find('refugee') >=0 and + key.lower().find('rate') <0 ): + key_unit.append('people') + + elif( key.lower().find('emissions') >=0 ): + key_unit.append('tonnes') + else: key_unit.append('unkown') self.data_info.append( key_unit ) @@ -349,6 +384,10 @@ class scrape: search.find('murder') >=0 ): self.meta['category'] = 'development' + #--------[ Crime ]--------# + elif( search.find('military') >=0 ): + self.meta['category'] = 'military' + #--------[ Uncategorised ]--------# else: self.meta['category'] = 'uncategorised' diff --git a/masterscraper/macrotrends/__init__.py b/masterscraper/macrotrends/__init__.py index a7df950..ff4bb8e 100644 --- a/masterscraper/macrotrends/__init__.py +++ b/masterscraper/macrotrends/__init__.py @@ -59,7 +59,8 @@ def scrape(url, meta, data): meta['name'] = url_parts[-1].replace('-',' ').title() - meta['description'] = soup.find('h1').text + soup_desc = getpage( 'https://www.macrotrends.net/countries/ranking/' + url.split('/')[-1] ) + meta['description'] = soup_desc.find('div',class_='navigation_tabs').find('span').text meta['authors'].append( soup.find('span', string='Data Source: ').next_sibling.text ) diff --git a/scrape_list.py b/scrape_list.py index ba2eb12..ada423c 100644 --- a/scrape_list.py +++ b/scrape_list.py @@ -1,9 +1,6 @@ #!/usr/bin/python3 -import masterscraper as ms - - scrapelist = ms.scrapelist('conf/wikipedia.txt') for url in scrapelist: scrape = ms.scrape(url) diff --git a/scrape_single.py b/scrape_single.py index 1851576..50887e2 100644 --- a/scrape_single.py +++ b/scrape_single.py @@ -5,7 +5,7 @@ import masterscraper as ms -scrape = ms.scrape('https://www.macrotrends.net/countries/TUR/turkey/population') +scrape = ms.scrape('https://www.macrotrends.net/countries/SGP/singapore/gdp-per-capita') scrape.get_meta() scrape.clean() scrape.save()