In [1]:
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
import matplotlib.pyplot as plt
import statsmodels.api as sm
In [2]:
# Import the cleaned car data
df = pd.read_csv("carDataClean.csv")
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32596 entries, 0 to 32595
Data columns (total 19 columns):
price             32596 non-null int64
miles             32596 non-null int64
fuel_type         32596 non-null object
exterior_color    32357 non-null object
interior_color    28987 non-null object
drivetrain        32596 non-null object
transmission      32596 non-null object
engine            32596 non-null object
VIN               32594 non-null object
name              32596 non-null object
sellerAddress     32596 non-null object
id                32596 non-null int64
modelName         32596 non-null object
diesel            32596 non-null bool
turbo             32596 non-null bool
hybrid            32596 non-null bool
used              32596 non-null bool
year              32596 non-null int64
make              32596 non-null object
dtypes: bool(4), int64(4), object(11)
memory usage: 3.9+ MB
In [3]:
# Convert categorical columns into numeric codes
# This would be better done with one-hot encoding (a sketch follows at the end of this cell)

drivetrain_map = {'FWD' : 0, 'RWD' : 1, '4WD' : 2}
drivetrain_numeric = [drivetrain_map[dt] for dt in df['drivetrain']]

transmission_map = {'CVT' : 0, 'Auto': 1, "Manual": 2}
transmission_numeric = [transmission_map[tr] for tr in df['transmission']]

engine_map = {'V12' : 12, 'V10': 10, 'V8': 8, 'V6': 6, 'V5': 5, 'V4': 4, 'V3': 3}
engine_numeric = [engine_map[en] for en in df['engine']]

diesel_numeric = [1 if d else 0 for d in df['diesel']]

turbo_numeric = [1 if d else 0 for d in df['turbo']]

hybrid_numeric = [1 if d else 0 for d in df['hybrid']]

# These model IDs and model names come from the time the data was scraped
model_map = {
    "C-Max Energi":50243, "C-Max Hybrid":49085, "Crown Victoria":20906, "E150":21050,
    "E250":26506, "E350":26502, "E350 Super Duty":26507, "EcoSport":36284899,
    "Edge":21039, "Escape":21088, "Excursion":21102, "Expedition":21104,
    "Expedition EL":21085, "Expedition Max":36324071, "Explorer":21105, "Explorer Sport Trac":21107,
    "F-150":21095, "F-250":21115, "F-350":21097, "Fiesta":21146, "Five Hundred":21156, "Flex":21136,
    "Focus":21138, "Focus ST":48704, "Freestar":21169, "Freestyle":21144, "Fusion":21175, "Fusion Energi":53027,
    "Fusion Hybrid":27661, "Mustang":21712, "Probe":21752, "Ranger":21874, "Sedan Police Interceptor":57387,
    "Shelby GT350":30021281, "Taurus":22164, "Thunderbird":22263, "Transit Connect":28203,
    "Transit-150":56747, "Transit-250":56748, "Transit-350":56749, "Model Unknown":29629
}
model_numeric = [model_map[md] for md in df['modelName']]
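
# The one-hot encoding mentioned above could be done directly with pandas (a minimal sketch,
# assuming these columns; X_onehot is an illustrative name and is not used in the cells below):
X_onehot = pd.get_dummies(
    df[['year', 'miles', 'drivetrain', 'transmission', 'engine', 'modelName']],
    columns=['drivetrain', 'transmission', 'engine', 'modelName']
)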
In [4]:
# Build the feature matrix that price will be predicted from. Columns such as the VIN or the
# seller address are not expected to be related to the price of the vehicle, so they are left out.

X = df.loc[:,['year', 'miles']]
X['model'] = model_numeric
X['transmission'] = transmission_numeric
X['engine'] = engine_numeric
X['turbo'] = turbo_numeric
X['diesel'] = diesel_numeric
X['hybrid'] = hybrid_numeric
X['drivetrain'] = drivetrain_numeric

Y = df['price']

# Create an 80/20 train/test split for training and evaluating the models
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2)
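
# For reproducible results, the split can be seeded (an alternative call, not used here;
# the numbers printed below come from the unseeded split above):
# X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.2, random_state=42)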
In [5]:
# K-Neighbors Regression
kneighbors_reg = KNeighborsRegressor().fit(X_Train, Y_Train)

# Produce predictions using model
kneighbors_predict = kneighbors_reg.predict(X_Test)

# Find the scores of the regression
kneighbors_r2_score = kneighbors_reg.score(X_Test, Y_Test)
kneighbors_ma_error = model_selection.cross_val_score(kneighbors_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
kneighbors_ms_error = model_selection.cross_val_score(kneighbors_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')
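
# Note: cross_val_score re-fits the estimator on folds of X_Test, so these values are
# cross-validation scores within the test split rather than the error of the model fitted
# on X_Train. A sketch of scoring the held-out predictions directly with the metrics
# imported above (the *_test_* names are illustrative and not used below):
kneighbors_test_mae = mean_absolute_error(Y_Test, kneighbors_predict)
kneighbors_test_mse = mean_squared_error(Y_Test, kneighbors_predict)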

# Graph predictions vs. actual
kneighbors_df = X_Test.copy()
kneighbors_df['model_price'] = kneighbors_predict
kneighbors_df['actual_price'] = df['price']
plt.title("Mileage vs. Price")
plt.scatter(kneighbors_df['miles'], kneighbors_df['model_price'], s=2, c='red')
plt.scatter(kneighbors_df['miles'], kneighbors_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Year vs. Price")
plt.scatter(kneighbors_df['year'], kneighbors_df['model_price'], s=2, c='red')
plt.scatter(kneighbors_df['year'], kneighbors_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Drivetrain vs. Price")
plt.scatter(kneighbors_df['drivetrain'], kneighbors_df['model_price'], s=2, c='red')
plt.scatter(kneighbors_df['drivetrain'], kneighbors_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Engine vs. Price")
plt.scatter(kneighbors_df['engine'], kneighbors_df['model_price'], s=2, c='red')
plt.scatter(kneighbors_df['engine'], kneighbors_df['actual_price'], s=2, c='blue')
plt.show()

print('KNeighbors regression: R^2 Score: ' + str(kneighbors_r2_score))
print('KNeighbors regression: Mean Absolute Error: ' + str(kneighbors_ma_error.mean()))
print('KNeighbors regression: Mean Squared Error: ' + str(kneighbors_ms_error.mean()))
print('KNeighbors regression: Root Mean Squared Error: ' + str(math.sqrt(abs(kneighbors_ms_error.mean()))))
KNeighbors regression: R^2 Score: 0.7320421153868665
KNeighbors regression: Mean Absolute Error: -6441.900351079069
KNeighbors regression: Mean Squared Error: -88419518.75518951
KNeighbors regression: Root Mean Squared Error: 9403.165358281727
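
# The same four prediction-vs-actual plots are repeated for every model below. A small helper
# could remove that duplication (a sketch; plot_predictions is an illustrative name and is not
# called in the cells that follow):
def plot_predictions(results_df, feature, title):
    plt.title(title)
    plt.scatter(results_df[feature], results_df['model_price'], s=2, c='red', label='predicted')
    plt.scatter(results_df[feature], results_df['actual_price'], s=2, c='blue', label='actual')
    plt.legend()
    plt.show()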
In [6]:
# Lasso Regression

lasso_reg = LassoLarsCV(cv=3, max_n_alphas = 10).fit(X_Train, Y_Train)

# Produce predictions using model
lasso_reg_predict = lasso_reg.predict(X_Test)

# Find the scores of the regression
lasso_r2_score = lasso_reg.score(X_Test, Y_Test)
lasso_ma_error = model_selection.cross_val_score(lasso_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
lasso_ms_error = model_selection.cross_val_score(lasso_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Graph predictions vs. actual
lasso_reg_df = X_Test.copy()
lasso_reg_df['model_price'] = lasso_reg_predict
lasso_reg_df['actual_price'] = df['price']
plt.title("Mileage vs. Price")
plt.scatter(lasso_reg_df['miles'], lasso_reg_df['model_price'], s=2, c='red')
plt.scatter(lasso_reg_df['miles'], lasso_reg_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Year vs. Price")
plt.scatter(lasso_reg_df['year'], lasso_reg_df['model_price'], s=2, c='red')
plt.scatter(lasso_reg_df['year'], lasso_reg_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Drivetrain vs. Price")
plt.scatter(lasso_reg_df['drivetrain'], lasso_reg_df['model_price'], s=2, c='red')
plt.scatter(lasso_reg_df['drivetrain'], lasso_reg_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Engine vs. Price")
plt.scatter(lasso_reg_df['engine'], lasso_reg_df['model_price'], s=2, c='red')
plt.scatter(lasso_reg_df['engine'], lasso_reg_df['actual_price'], s=2, c='blue')
plt.show()

print('Lasso regression: R^2 Score: ' + str(lasso_r2_score))
print('Lasso regression: Mean Absolute Error: ' + str(lasso_ma_error.mean()))
print('Lasso regression: Mean Squared Error: ' + str(lasso_ms_error.mean()))
print('Lasso regression: Root Mean Squared Error: ' + str(math.sqrt(abs(lasso_ms_error.mean()))))
Lasso regression: R^2 Score: 0.750355790089242
Lasso regression: Mean Absolute Error: -5756.794462643506
Lasso regression: Mean Squared Error: -63295198.11914897
Lasso regression: Root Mean Squared Error: 7955.827934234687
In [7]:
# Random Forest Regression

rf_reg = RandomForestRegressor().fit(X_Train, Y_Train)

# Produce predictions using model
rf_predict = rf_reg.predict(X_Test)

# Find the scores of the regression
rf_r2_score = rf_reg.score(X_Test, Y_Test)
rf_ma_error = model_selection.cross_val_score(rf_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
rf_ms_error = model_selection.cross_val_score(rf_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Graph predictions vs. actual
rf_df = X_Test.copy()
rf_df['model_price'] = rf_predict
rf_df['actual_price'] = df['price']
plt.title("Mileage vs. Price")
plt.scatter(rf_df['miles'], rf_df['model_price'], s=2, c='red')
plt.scatter(rf_df['miles'], rf_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Year vs. Price")
plt.scatter(rf_df['year'], rf_df['model_price'], s=2, c='red')
plt.scatter(rf_df['year'], rf_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Drivetrain vs. Price")
plt.scatter(rf_df['drivetrain'], rf_df['model_price'], s=2, c='red')
plt.scatter(rf_df['drivetrain'], rf_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Engine vs. Price")
plt.scatter(rf_df['engine'], rf_df['model_price'], s=2, c='red')
plt.scatter(rf_df['engine'], rf_df['actual_price'], s=2, c='blue')
plt.show()

print('Random Forest regression: R^2 Score: ' + str(rf_r2_score))
print('Random Forest regression: Mean Absolute Error: ' + str(rf_ma_error.mean()))
print('Random Forest regression: Mean Squared Error: ' + str(rf_ms_error.mean()))
print('Random Forest regression: Root Mean Squared Error: ' + str(math.sqrt(abs(rf_ms_error.mean()))))
Random Forest regression: R^2 Score: 0.9042176074344574
Random Forest regression: Mean Absolute Error: -3556.6838061280246
Random Forest regression: Mean Squared Error: -29367242.076881412
Random Forest regression: Root Mean Squared Error: 5419.155107291303
In [8]:
# Linear Regression

linear_reg = LinearRegression().fit(X_Train, Y_Train)

# Produce predictions using model
linear_predict = linear_reg.predict(X_Test)

# Find the scores of the regression
linear_r2_score = linear_reg.score(X_Test, Y_Test)
linear_ma_error = model_selection.cross_val_score(linear_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
linear_ms_error = model_selection.cross_val_score(linear_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Graph predictions vs. actual
linear_df = X_Test.copy()
linear_df['model_price'] = linear_predict
linear_df['actual_price'] = df['price']
plt.title("Mileage vs. Price")
plt.scatter(linear_df['miles'], linear_df['model_price'], s=2, c='red')
plt.scatter(linear_df['miles'], linear_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Year vs. Price")
plt.scatter(linear_df['year'], linear_df['model_price'], s=2, c='red')
plt.scatter(linear_df['year'], linear_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Drivetrain vs. Price")
plt.scatter(linear_df['drivetrain'], linear_df['model_price'], s=2, c='red')
plt.scatter(linear_df['drivetrain'], linear_df['actual_price'], s=2, c='blue')
plt.show()
plt.title("Engine vs. Price")
plt.scatter(linear_df['engine'], linear_df['model_price'], s=2, c='red')
plt.scatter(linear_df['engine'], linear_df['actual_price'], s=2, c='blue')
plt.show()

print('Linear regression: R^2 Score: ' + str(linear_r2_score))
print('Linear regression: Mean Absolute Error: ' + str(linear_ma_error.mean()))
print('Linear regression: Mean Squared Error: ' + str(linear_ms_error.mean()))
print('Linear regression: Root Mean Squared Error: ' + str(math.sqrt(abs(linear_ms_error.mean()))))
Linear regression: R^2 Score: 0.7503557900892426
Linear regression: Mean Absolute Error: -5756.7944626435055
Linear regression: Mean Squared Error: -63295198.11914886
Linear regression: Root Mean Squared Error: 7955.82793423468
In [9]:
r2_scores = [kneighbors_r2_score, lasso_r2_score, rf_r2_score, linear_r2_score]

plt.title('R^2 Score of Models')
plt.bar(['K-Neighbors', 'Lasso', 'Rand. Forest', 'Linear'], r2_scores, .5, 
        color='#0066cc', edgecolor="#001a33", linewidth=4, alpha=0.7)
plt.show()
In [10]:
ms_errors = [abs(kneighbors_ms_error.mean()), abs(lasso_ms_error.mean()), abs(rf_ms_error.mean()), abs(linear_ms_error.mean())]

plt.title('Mean Squared Error of Models')
plt.bar(['K-Neighbors', 'Lasso', 'Rand. Forest', 'Linear'], ms_errors, .5, 
        color='#009900', edgecolor="#003300", linewidth=4, alpha=0.7)
plt.show()
In [11]:
ma_errors = [abs(kneighbors_ma_error.mean()), abs(lasso_ma_error.mean()), abs(rf_ma_error.mean()), abs(linear_ma_error.mean())]

plt.title('Mean Absolute Error of Models')
plt.bar(['K-Neighbors', 'Lasso', 'Rand. Forest', 'Linear'], ma_errors, .5, 
        color='#ff9933', edgecolor="#4d2600", linewidth=4, alpha=0.7)
plt.show()
In [12]:
rmse_errors = [math.sqrt(abs(kneighbors_ms_error.mean())), math.sqrt(abs(lasso_ms_error.mean())), math.sqrt(abs(rf_ms_error.mean())), math.sqrt(abs(linear_ms_error.mean()))]

plt.title('Root Mean Squared Error of Models')
plt.bar(['K-Neighbors', 'Lasso', 'Rand. Forest', 'Linear'], rmse_errors, .5, 
        color='#ff3932', edgecolor="#4d0000", linewidth=4, alpha=0.7)
plt.show()
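
# A compact alternative to the four separate bar charts: collect the same metrics into one
# frame for side-by-side comparison (a sketch; the summary frame is illustrative only):
summary = pd.DataFrame({
    'R^2': [kneighbors_r2_score, lasso_r2_score, rf_r2_score, linear_r2_score],
    'MAE': [abs(kneighbors_ma_error.mean()), abs(lasso_ma_error.mean()),
            abs(rf_ma_error.mean()), abs(linear_ma_error.mean())],
    'RMSE': [math.sqrt(abs(kneighbors_ms_error.mean())), math.sqrt(abs(lasso_ms_error.mean())),
             math.sqrt(abs(rf_ms_error.mean())), math.sqrt(abs(linear_ms_error.mean()))]
}, index=['K-Neighbors', 'Lasso', 'Rand. Forest', 'Linear'])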
In [13]:
# Simple user interface for manual testing of model

year = float(input('What year?: '))
miles = float(input('How many miles?: '))
model = float(model_map[input('What model?: ')])
trans = float(transmission_map[input('What transmission type (CVT/Auto/Manual)?: ')])
engine = float(engine_map[input('What engine size?: ')])
turbo = float(input('Does it have a turbo (0:false/1:true)?: '))
diesel = float(input('Is it diesel (0:false/1:true)?: '))
hybrid = float(input('Is it hybrid (0:false/1:true)?: '))
drivetrain = float(drivetrain_map[input('What drivetrain (FWD/RWD/4WD)?: ')])

predicted_price = rf_reg.predict([[year, miles, model, trans, engine, turbo, diesel, hybrid, drivetrain]])
print('The predicted price of this car is: ' + str(predicted_price[0]))
What year?: 2011
How many miles?: 126000
What model?: Fusion
What transmission type (CVT/Auto/Manual)?: Auto
What engine size?: V6
Does it have a turbo (0:false/1:true)?: 0
Is it diesel (0:false/1:true)?: 0
Is it hybrid (0:false/1:true)?: 0
What drivetrain (FWD/RWD/4WD)?: FWD
The predicted price of this car is: 8700.1
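
# Passing a bare list to predict() works, but recent scikit-learn versions warn that the
# feature names seen during fitting are missing. Building a one-row DataFrame with the
# training columns avoids the warning (a sketch using the values entered above):
query = pd.DataFrame([[year, miles, model, trans, engine, turbo, diesel, hybrid, drivetrain]],
                     columns=X.columns)
predicted_price = rf_reg.predict(query)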