import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn import model_selection
import matplotlib.pyplot as plt
import statsmodels.api as sm
# Load the cleaned car listings and print a summary of the frame.
df = pd.read_csv("carDataClean.csv")
df.info()

# Convert categorical columns into numeric codes so the regressors can consume them.
# One-hot encoding would be a better fit than these integer codes.
drivetrain_map = {
    'FWD': 0,
    'RWD': 1,
    '4WD': 2,
}
drivetrain_numeric = list(map(drivetrain_map.__getitem__, df['drivetrain']))

# The name keeps its original spelling because the interactive prompt at the end
# of the script looks values up through it.
tranmission_map = {
    'CVT': 0,
    'Auto': 1,
    "Manual": 2,
}
transmission_numeric = list(map(tranmission_map.__getitem__, df['transmission']))

# Each engine label maps to the number that appears in the label itself.
engine_map = {
    'V12': 12,
    'V10': 10,
    'V8': 8,
    'V6': 6,
    'V5': 5,
    'V4': 4,
    'V3': 3,
}
engine_numeric = list(map(engine_map.__getitem__, df['engine']))

# Flag columns: any truthy cell becomes 1, any falsy cell becomes 0.
diesel_numeric = [int(bool(flag)) for flag in df['diesel']]
turbo_numeric = [int(bool(flag)) for flag in df['turbo']]
hybrid_numeric = [int(bool(flag)) for flag in df['hybrid']]
# The model names and their numeric codes date from when the data was scraped.
# NOTE(review): presumably the codes are the listing site's own model identifiers;
# confirm against the scraper. They are fed to the regressors as a plain number.
model_map = {
    "C-Max Energi": 50243,
    "C-Max Hybrid": 49085,
    "Crown Victoria": 20906,
    "E150": 21050,
    "E250": 26506,
    "E350": 26502,
    "E350 Super Duty": 26507,
    "EcoSport": 36284899,
    "Edge": 21039,
    "Escape": 21088,
    "Excursion": 21102,
    "Expedition": 21104,
    "Expedition EL": 21085,
    "Expedition Max": 36324071,
    "Explorer": 21105,
    "Explorer Sport Trac": 21107,
    "F-150": 21095,
    "F-250": 21115,
    "F-350": 21097,
    "Fiesta": 21146,
    "Five Hundred": 21156,
    "Flex": 21136,
    "Focus": 21138,
    "Focus ST": 48704,
    "Freestar": 21169,
    "Freestyle": 21144,
    "Fusion": 21175,
    "Fusion Energi": 53027,
    "Fusion Hybrid": 27661,
    "Mustang": 21712,
    "Probe": 21752,
    "Ranger": 21874,
    "Sedan Police Interceptor": 57387,
    "Shelby GT350": 30021281,
    "Taurus": 22164,
    "Thunderbird": 22263,
    "Transit Connect": 28203,
    "Transit-150": 56747,
    "Transit-250": 56748,
    "Transit-350": 56749,
    "Model Unknown": 29629,
}
# A model name that is missing from the table raises KeyError here.
model_numeric = list(map(model_map.__getitem__, df['modelName']))
# Assemble the features the price is predicted from. Fields such as the VIN or the
# seller address are left out because they are not expected to relate to the price.
X = df.loc[:, ['year', 'miles']]
# The insertion order below fixes the column order of the feature matrix; the
# interactive prompt at the end of the script supplies values in this same order.
for column_name, encoded_values in (
    ('model', model_numeric),
    ('transmission', transmission_numeric),
    ('engine', engine_numeric),
    ('turbo', turbo_numeric),
    ('diesel', diesel_numeric),
    ('hybrid', hybrid_numeric),
    ('drivetrain', drivetrain_numeric),
):
    X[column_name] = encoded_values

# Regression target.
Y = df['price']

# Hold out 20% of the rows for testing; the remaining 80% train the models.
# No random_state is passed, so the split differs from run to run.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X,
    Y,
    test_size=0.2,
)
# K-Neighbors Regression
# Fit on the training split with scikit-learn's default hyperparameters.
kneighbors_reg = KNeighborsRegressor()
kneighbors_reg.fit(X_Train, Y_Train)

# Predictions for the held-out rows; these are the values plotted below.
kneighbors_predict = kneighbors_reg.predict(X_Test)

# R^2 of the fitted model on the held-out rows.
kneighbors_r2_score = kneighbors_reg.score(X_Test, Y_Test)
# NOTE(review): cross_val_score refits clones of the estimator on folds of the data
# it is handed, so these arrays hold cross-validated (negated) errors within the
# test split rather than errors of the model fitted above - confirm this is intended.
kneighbors_ma_error = model_selection.cross_val_score(
    kneighbors_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
kneighbors_ms_error = model_selection.cross_val_score(
    kneighbors_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Overlay predicted (red) and actual (blue) prices against selected features.
kneighbors_df = X_Test.copy()
kneighbors_df['model_price'] = kneighbors_predict
# Y_Test carries the same row labels as X_Test, so it lines up with the copy.
kneighbors_df['actual_price'] = Y_Test
for feature_name, chart_title in (
    ('miles', "Mileage vs. Price"),
    ('year', "Year vs. Price"),
    ('drivetrain', "Drivetrain vs. Price"),
    ('engine', "Engine vs. Price"),
):
    plt.title(chart_title)
    plt.scatter(kneighbors_df[feature_name], kneighbors_df['model_price'], s=2, c='red')
    plt.scatter(kneighbors_df[feature_name], kneighbors_df['actual_price'], s=2, c='blue')
    plt.show()

# The "neg_*" scorers report negated errors, so the two means print as negative
# numbers; abs() is applied before taking the square root.
kneighbors_mse_mean = kneighbors_ms_error.mean()
print('KNeighbors regression: R^2 Score: ', kneighbors_r2_score, sep='')
print('KNeighbors regression: Mean Absolute Error: ', kneighbors_ma_error.mean(), sep='')
print('KNeighbors regression: Mean Squared Error: ', kneighbors_mse_mean, sep='')
print('KNeighbors regression: Root Mean Squared Error: ', math.sqrt(abs(kneighbors_mse_mean)), sep='')
# Lasso Regression
# LassoLarsCV selects its regularisation strength by 3-fold cross-validation on
# the training split.
lasso_reg = LassoLarsCV(cv=3, max_n_alphas=10)
lasso_reg.fit(X_Train, Y_Train)

# Predictions for the held-out rows; these are the values plotted below.
lasso_reg_predict = lasso_reg.predict(X_Test)

# R^2 of the fitted model on the held-out rows.
lasso_r2_score = lasso_reg.score(X_Test, Y_Test)
# NOTE(review): cross_val_score refits clones of the estimator on folds of the data
# it is handed, so these arrays hold cross-validated (negated) errors within the
# test split rather than errors of the model fitted above - confirm this is intended.
lasso_ma_error = model_selection.cross_val_score(
    lasso_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
lasso_ms_error = model_selection.cross_val_score(
    lasso_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Overlay predicted (red) and actual (blue) prices against selected features.
lasso_reg_df = X_Test.copy()
lasso_reg_df['model_price'] = lasso_reg_predict
# Y_Test carries the same row labels as X_Test, so it lines up with the copy.
lasso_reg_df['actual_price'] = Y_Test
for feature_name, chart_title in (
    ('miles', "Mileage vs. Price"),
    ('year', "Year vs. Price"),
    ('drivetrain', "Drivetrain vs. Price"),
    ('engine', "Engine vs. Price"),
):
    plt.title(chart_title)
    plt.scatter(lasso_reg_df[feature_name], lasso_reg_df['model_price'], s=2, c='red')
    plt.scatter(lasso_reg_df[feature_name], lasso_reg_df['actual_price'], s=2, c='blue')
    plt.show()

# The "neg_*" scorers report negated errors, so the two means print as negative
# numbers; abs() is applied before taking the square root.
lasso_mse_mean = lasso_ms_error.mean()
print('Lasso regression: R^2 Score: ', lasso_r2_score, sep='')
print('Lasso regression: Mean Absolute Error: ', lasso_ma_error.mean(), sep='')
print('Lasso regression: Mean Squared Error: ', lasso_mse_mean, sep='')
print('Lasso regression: Root Mean Squared Error: ', math.sqrt(abs(lasso_mse_mean)), sep='')
# Random Forest Regression
# Fit on the training split with scikit-learn's default hyperparameters. This is
# also the model the interactive prompt at the end of the script queries.
rf_reg = RandomForestRegressor()
rf_reg.fit(X_Train, Y_Train)

# Predictions for the held-out rows; these are the values plotted below.
rf_predict = rf_reg.predict(X_Test)

# R^2 of the fitted model on the held-out rows.
rf_r2_score = rf_reg.score(X_Test, Y_Test)
# NOTE(review): cross_val_score refits clones of the estimator on folds of the data
# it is handed, so these arrays hold cross-validated (negated) errors within the
# test split rather than errors of the model fitted above - confirm this is intended.
rf_ma_error = model_selection.cross_val_score(
    rf_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
rf_ms_error = model_selection.cross_val_score(
    rf_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Overlay predicted (red) and actual (blue) prices against selected features.
rf_df = X_Test.copy()
rf_df['model_price'] = rf_predict
# Y_Test carries the same row labels as X_Test, so it lines up with the copy.
rf_df['actual_price'] = Y_Test
for feature_name, chart_title in (
    ('miles', "Mileage vs. Price"),
    ('year', "Year vs. Price"),
    ('drivetrain', "Drivetrain vs. Price"),
    ('engine', "Engine vs. Price"),
):
    plt.title(chart_title)
    plt.scatter(rf_df[feature_name], rf_df['model_price'], s=2, c='red')
    plt.scatter(rf_df[feature_name], rf_df['actual_price'], s=2, c='blue')
    plt.show()

# The "neg_*" scorers report negated errors, so the two means print as negative
# numbers; abs() is applied before taking the square root.
rf_mse_mean = rf_ms_error.mean()
print('Random Forest regression: R^2 Score: ', rf_r2_score, sep='')
print('Random Forest regression: Mean Absolute Error: ', rf_ma_error.mean(), sep='')
print('Random Forest regression: Mean Squared Error: ', rf_mse_mean, sep='')
print('Random Forest regression: Root Mean Squared Error: ', math.sqrt(abs(rf_mse_mean)), sep='')
# Linear Regression
# Ordinary least squares fitted on the training split.
linear_reg = LinearRegression()
linear_reg.fit(X_Train, Y_Train)

# Predictions for the held-out rows; these are the values plotted below.
linear_predict = linear_reg.predict(X_Test)

# R^2 of the fitted model on the held-out rows.
linear_r2_score = linear_reg.score(X_Test, Y_Test)
# NOTE(review): cross_val_score refits clones of the estimator on folds of the data
# it is handed, so these arrays hold cross-validated (negated) errors within the
# test split rather than errors of the model fitted above - confirm this is intended.
linear_ma_error = model_selection.cross_val_score(
    linear_reg, X_Test, Y_Test, scoring='neg_mean_absolute_error')
linear_ms_error = model_selection.cross_val_score(
    linear_reg, X_Test, Y_Test, scoring='neg_mean_squared_error')

# Overlay predicted (red) and actual (blue) prices against selected features.
linear_df = X_Test.copy()
linear_df['model_price'] = linear_predict
# Y_Test carries the same row labels as X_Test, so it lines up with the copy.
linear_df['actual_price'] = Y_Test
for feature_name, chart_title in (
    ('miles', "Mileage vs. Price"),
    ('year', "Year vs. Price"),
    ('drivetrain', "Drivetrain vs. Price"),
    ('engine', "Engine vs. Price"),
):
    plt.title(chart_title)
    plt.scatter(linear_df[feature_name], linear_df['model_price'], s=2, c='red')
    plt.scatter(linear_df[feature_name], linear_df['actual_price'], s=2, c='blue')
    plt.show()

# The "neg_*" scorers report negated errors, so the two means print as negative
# numbers; abs() is applied before taking the square root.
linear_mse_mean = linear_ms_error.mean()
print('Linear regression: R^2 Score: ' + str(linear_r2_score))
print('Linear regression: Mean Absolute Error: ' + str(linear_ma_error.mean()))
# Fixed: this line used to print the mean absolute error under the MSE label.
print('Linear regression: Mean Squared Error: ' + str(linear_mse_mean))
print('Linear regression: Root Mean Squared Error: ' + str(math.sqrt(abs(linear_mse_mean))))
# Compare the four models with one bar chart per metric. Every chart uses the same
# model order, so the labels are defined once.
model_labels = ['K-Neighbors', 'Lasso', 'Rand. Forest', 'Linear']

r2_scores = [kneighbors_r2_score, lasso_r2_score, rf_r2_score, linear_r2_score]
plt.title('R^2 Score of Models')
plt.bar(model_labels, r2_scores, .5,
        color='#0066cc', edgecolor="#001a33", linewidth=4, alpha=0.7)
plt.show()

# The "neg_*" scorers yield negated errors, so abs() recovers the magnitude of
# each mean. Each metric gets its own name instead of reusing r2_scores.
mse_scores = [
    abs(fold_errors.mean())
    for fold_errors in (kneighbors_ms_error, lasso_ms_error, rf_ms_error, linear_ms_error)
]
plt.title('Mean Squared Error of Models')
plt.bar(model_labels, mse_scores, .5,
        color='#009900', edgecolor="#003300", linewidth=4, alpha=0.7)
plt.show()

mae_scores = [
    abs(fold_errors.mean())
    for fold_errors in (kneighbors_ma_error, lasso_ma_error, rf_ma_error, linear_ma_error)
]
plt.title('Mean Absolute Error of Models')
plt.bar(model_labels, mae_scores, .5,
        color='#ff9933', edgecolor="#4d2600", linewidth=4, alpha=0.7)
plt.show()

# A square root is never negative, so no further abs() is needed around it.
rmse_scores = [math.sqrt(mse) for mse in mse_scores]
plt.title('Root Mean Squared Error of Models')
plt.bar(model_labels, rmse_scores, .5,
        color='#ff3932', edgecolor="#4d0000", linewidth=4, alpha=0.7)
plt.show()
# Simple user interface for manually testing the random forest model.
# Numeric answers are parsed with float() (ValueError on non-numeric text); the
# categorical answers must match a key of the corresponding map exactly
# (KeyError otherwise), e.g. an engine is entered as 'V6'.
year = float(input('What year?: '))
miles = float(input('How many miles?: '))
model = float(model_map[input('What model?: ')])
trans = float(tranmission_map[input('What transmission type (CVT/Auto/Manual?: ')])
engine = float(engine_map[input('What engine size?: ')])
turbo = float(input('Does it have a turbo (0:false/1:true)?: '))
diesel = float(input('Is it diesel (0:false/1:true)?: '))
hybrid = float(input('Is it hybrid (0:false/1:true)?: '))
drivetrain = float(drivetrain_map[input('What drivetrain (FWD/RWD/4WD)?: ')])

# The forest was fitted on a DataFrame, so pass a one-row DataFrame carrying the
# same column names. This keeps the values tied to the right features and avoids
# scikit-learn's "X does not have valid feature names" warning for bare lists.
user_features = pd.DataFrame(
    [[year, miles, model, trans, engine, turbo, diesel, hybrid, drivetrain]],
    columns=X.columns,
)
predicted_price = rf_reg.predict(user_features)
print('The predicted price of this car is: ' + str(predicted_price[0]))