masterscraper/masterscraper/core/clean.py

57 lines
2.0 KiB
Python

#!/usr/bin/env python3
import re
# Check If String Is Number
def isfloat(num):
    """Return True when ``float(num)`` succeeds, False otherwise.

    Args:
        num: Any object; typically a string taken from scraped data.

    Returns:
        bool: True if ``num`` can be converted with ``float()``, else False.
    """
    try:
        float(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None, list).
        # ValueError: string that is not a number.
        # OverflowError: integer too large to be represented as a float.
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return False
    return True
#--------[ Clean Scrape Data ]--------#
def clean(self):
    """Clean the scraped table held in ``self.data`` in place.

    Row 0 is never touched (presumably the header row -- confirm against the
    scraper). In every other row, each cell that is a string is processed:

    1. Bracketed ``[...]`` notes, parenthesised ``(...)`` notes and commas
       are removed and surrounding whitespace is stripped.
    2. If the text contains any digit, every character other than digits,
       ``.`` and ``-`` is dropped.
    3. If the text then parses as a float, the cell becomes that number
       multiplied by ``self.info['multiplyer'][column]``; whole results are
       stored as ``int``.
    4. Otherwise the cell becomes ``None`` when it is empty, ``'-'`` or
       contains a non-entry marker (e.g. "not determined", "unknown");
       any other text is stored in its cleaned form.

    Cells that are not strings are left unchanged.

    Returns:
        ``-1`` when ``self.data`` has at most one row (nothing to clean);
        otherwise ``None``.
    """
    if len(self.data) <= 1:
        return -1  # Break if no data

    for x in range(1, len(self.data)):
        row = self.data[x]
        for y in range(len(row)):
            cell = row[y]
            # NOTE(review): numeric conversion is limited to string cells;
            # confirm that already-numeric cells should not be multiplied.
            if not isinstance(cell, str):
                continue

            text = _strip_notes(cell)
            if any(ch.isdigit() for ch in text):
                # Drop units and other decoration around a numeric value.
                text = ''.join(
                    ch for ch in text if ch.isdigit() or ch in ('.', '-')
                )

            number = _parse_float(text)
            if number is None:
                row[y] = None if _is_non_entry(text) else text
                continue

            # The multiplier is looked up only for numeric cells, so
            # non-numeric columns do not need a multiplier entry.
            number = number * self.info['multiplyer'][y]
            if number.is_integer():
                number = int(number)
            row[y] = number


# Inline notes such as "[1]" or "(est.)". The patterns are greedy: everything
# from the first opening to the last closing delimiter is removed.
_BRACKET_NOTE = re.compile(r'\[.*\]')
_PAREN_NOTE = re.compile(r'\(.*\)')

# Substrings (matched case-insensitively) that mark a cell as a non-entry.
# 'unkown' is the historical misspelling, kept so old matches still work.
_NON_ENTRY_MARKERS = ('not determined', 'negligible', 'unknown', 'unkown')


def _strip_notes(text):
    """Remove bracketed/parenthesised notes and commas, then strip whitespace."""
    text = _BRACKET_NOTE.sub('', text)
    text = _PAREN_NOTE.sub('', text)
    return text.replace(',', '').strip()


def _parse_float(text):
    """Return ``float(text)``, or ``None`` when the string is not a number."""
    try:
        return float(text)
    except ValueError:
        return None


def _is_non_entry(text):
    """Return True for empty cells, a lone dash, or text with a non-entry marker."""
    if text in ('', '-'):
        return True
    lowered = text.lower()
    return any(marker in lowered for marker in _NON_ENTRY_MARKERS)