In [82]:
import pandas as pd
import numpy as np
In [83]:
# Read the unclean data that was scraped
df = pd.read_csv("carData.csv")
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51783 entries, 0 to 51782
Data columns (total 13 columns):
price             50987 non-null object
miles             42301 non-null object
fuel_type         51655 non-null object
exterior_color    50885 non-null object
interior_color    45102 non-null object
drivetrain        48653 non-null object
transmission      49856 non-null object
engine            49340 non-null object
VIN               51526 non-null object
name              51657 non-null object
sellerAddress     51517 non-null object
id                51783 non-null int64
modelName         51783 non-null object
dtypes: int64(1), object(12)
memory usage: 5.1+ MB
In [84]:
# Replace empty cells with NaN to prepare for removing rows
df['price'].replace('', np.nan, inplace=True)
df['miles'].replace('', np.nan, inplace=True)
df['sellerAddress'].replace('', np.nan, inplace=True)
df['drivetrain'].replace('', np.nan, inplace=True)
df['transmission'].replace('', np.nan, inplace=True)
df['engine'].replace('', np.nan, inplace=True)

# Remove rows with empty entries in columns deemed important
df.dropna(subset=['price'], inplace=True)
df.dropna(subset=['miles'], inplace=True)
df.dropna(subset=['sellerAddress'], inplace=True)
df.dropna(subset=['drivetrain'], inplace=True)
df.dropna(subset=['transmission'], inplace=True)
df.dropna(subset=['engine'], inplace=True)

# Reset indexes so info is accurate
df.reset_index(drop=True, inplace=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39626 entries, 0 to 39625
Data columns (total 13 columns):
price             39626 non-null object
miles             39626 non-null object
fuel_type         39626 non-null object
exterior_color    39275 non-null object
interior_color    35182 non-null object
drivetrain        39626 non-null object
transmission      39626 non-null object
engine            39626 non-null object
VIN               39624 non-null object
name              39626 non-null object
sellerAddress     39626 non-null object
id                39626 non-null int64
modelName         39626 non-null object
dtypes: int64(1), object(12)
memory usage: 3.9+ MB
In [85]:
# Remove unit from "miles" column and convert column to number
df['miles'] = df['miles'].str.replace(" miles", "")
df['miles'] = df['miles'].str.replace(",", "")
df['miles'] = pd.to_numeric(df['miles'])
In [86]:
# Remove unit from "price" column and convert column to number
df['price'] = df['price'].str.replace("$", "")
df['price'] = df['price'].str.replace(",", "")
df['price'] = pd.to_numeric(df['price'])
In [87]:
# Standardize values in the "transmission" column

# Start by looking at CVT many CVT columns contain "Automatic"
df.loc[df['transmission'].str.contains('CVT', case=False), 'transmission'] = 'CVT'
df.loc[df['transmission'].str.contains('Variable', case=False), 'transmission'] = 'CVT'

# Find automatic variations
df.loc[df['transmission'].str.contains('Automatic', case=False), 'transmission']= 'Auto'
df.loc[df['transmission'].str.contains('Auto', case=False), 'transmission']= 'Auto'
df.loc[df['transmission'].str.strip() == "A", 'transmission'] = 'Auto'
df.loc[df['transmission'].str.strip() == "a", 'transmission'] = 'Auto'
df.loc[df['transmission'].str.contains('A/T', case=False), 'transmission'] = 'Auto'
df.loc[df['transmission'].str.contains('AT', case=False), 'transmission'] = 'Auto'

# Find manual variation assuming all "#-speed" that don't specify "automatic" are manual
df.loc[df['transmission'].str.contains('Manual', case=False), 'transmission']= 'Manual'
df.loc[df['transmission'].str.strip() == "m", 'transmission'] = 'Manual'
df.loc[df['transmission'].str.contains('Speed', case=False), 'transmission']= 'Manual'
df.loc[df['transmission'].str.contains('Dual Shift', case=False), 'transmission']= 'Manual'

# Now that we've mapped as far as possible remove all rows we couldn't map
df = df[df['transmission'].isin(['CVT', 'Auto', 'Manual'])]

df.reset_index(drop=True, inplace=True)
In [88]:
# Standardize the "drivetrain column"

# Find variations of front wheel drive
df.loc[df['drivetrain'].str.contains('Front', case=False), 'drivetrain'] = 'FWD'
df.loc[df['drivetrain'].str.contains('FWD', case=False), 'drivetrain'] = 'FWD'
df.loc[df['drivetrain'].str.contains('4x2', case=False), 'drivetrain'] = 'FWD'
df.loc[df['drivetrain'].str.contains('2WD', case=False), 'drivetrain'] = 'FWD'

# Find variations of 4 wheel drive assuming 4x4 and all wheel are identical
df.loc[df['drivetrain'].str.contains('Four', case=False), 'drivetrain'] = '4WD'
df.loc[df['drivetrain'].str.contains('4', case=False), 'drivetrain'] = '4WD'
df.loc[df['drivetrain'].str.contains('All', case=False), 'drivetrain'] = '4WD'

# Find variations of rear wheel drive
df.loc[df['drivetrain'].str.contains('RWD', case=False), 'drivetrain'] = 'RWD'
df.loc[df['drivetrain'].str.contains('Rear', case=False), 'drivetrain'] = 'RWD'

# Now that we've mapped as far as possible remove all rows we couldn't map
df = df[df['drivetrain'].isin(['FWD', '4WD', 'RWD'])]

df.reset_index(drop=True, inplace=True)
In [89]:
df['diesel'] = df['engine'].str.contains('Diesel', case=False)
df['turbo'] = df['engine'].str.contains('turbo|super', case=False)
df['hybrid'] = df['engine'].str.contains('hybrid', case=False)

df.loc[df['engine'].str.contains('12 Cylinder', case=False), 'engine'] = 'V12'

df.loc[df['engine'].str.contains('10 Cylinder', case=False), 'engine'] = 'V10'
df.loc[df['engine'].str.contains('10 Cyl', case=False), 'engine'] = 'V10'
df.loc[df['engine'].str.contains('V10', case=False), 'engine'] = 'V10'
df.loc[df['engine'].str.contains('V-10', case=False), 'engine'] = 'V10'

df.loc[df['engine'].str.contains('V8', case=False), 'engine'] = 'V8'
df.loc[df['engine'].str.contains('8 Cylinder', case=False), 'engine'] = 'V8'
df.loc[df['engine'].str.contains('8 Cyl', case=False), 'engine'] = 'V8'
df.loc[df['engine'].str.contains('8-Cyl', case=False), 'engine'] = 'V8'
df.loc[df['engine'].str.contains('V-8', case=False), 'engine'] = 'V8'

df.loc[df['engine'].str.contains('V6', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('V-6', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('I6', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('L6', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('I-6', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('6-Cyl', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('6 Cylinder', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('6 Cyl', case=False), 'engine'] = 'V6'
df.loc[df['engine'].str.contains('6Cyl', case=False), 'engine'] = 'V6'

df.loc[df['engine'].str.contains('5 Cylinder', case=False), 'engine'] = 'V5'
df.loc[df['engine'].str.contains('5 Cyl', case=False), 'engine'] = 'V5'
df.loc[df['engine'].str.contains('I5', case=False), 'engine'] = 'V5'

df.loc[df['engine'].str.contains('V4', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('4-Cyl', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('4 Cyl', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('4Cyl', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('I4', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('I-4', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('L4', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('V-4', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('2.5L', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('2.0L', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('1.5L', case=False), 'engine'] = 'V4'
df.loc[df['engine'].str.contains('4 Cylinder', case=False), 'engine'] = 'V4'

df.loc[df['engine'].str.contains('3 Cyl', case=False), 'engine'] = 'V3'
df.loc[df['engine'].str.contains('I3', case=False), 'engine'] = 'V3'

df = df[df['engine'].isin(['V12', 'V10', 'V8', 'V6', 'V5', 'V4', 'V3'])]

df.reset_index(drop=True, inplace=True)
In [90]:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33293 entries, 0 to 33292
Data columns (total 16 columns):
price             33293 non-null int64
miles             33293 non-null int64
fuel_type         33293 non-null object
exterior_color    33030 non-null object
interior_color    29629 non-null object
drivetrain        33293 non-null object
transmission      33293 non-null object
engine            33293 non-null object
VIN               33291 non-null object
name              33293 non-null object
sellerAddress     33293 non-null object
id                33293 non-null int64
modelName         33293 non-null object
diesel            33293 non-null bool
turbo             33293 non-null bool
hybrid            33293 non-null bool
dtypes: bool(3), int64(3), object(10)
memory usage: 3.4+ MB
In [91]:
# Split car name into it's componenets (new/used) year make
splitNameDf = df['name'].str.split(' ', expand=True, n=3)
df['used'] = splitNameDf[0] != 'New'
df['year'] = splitNameDf[1]
df['make'] = splitNameDf[2]
In [92]:
# Some non Ford mixed in while scraping, remove those
df = df[df['make'] == "Ford"]
In [93]:
df.to_csv("carDataClean.csv", encoding='utf-8', index=False)