"""Train and evaluate a random-forest classifier on the first 5000 rows of train.csv.

Steps, in order:
  1. Load ``train.csv``, keep the first 5000 rows, and drop the
     ``Unnamed: 0`` and ``id`` columns.
  2. Report and plot the ``satisfaction`` class counts, then drop rows
     that contain missing values.
  3. One-hot encode the ``Gender``, ``Customer Type``, ``Type of Travel``
     and ``Class`` columns.
  4. Make a stratified 80/20 train/test split and label-encode the target.
  5. Fit a ``RandomForestClassifier`` and print train/test accuracy, a
     confusion matrix plot, and a classification report.
"""
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Load and clean -------------------------------------------------------
cdf = pd.read_csv('train.csv')
cdf = cdf[:5000]
# `columns=` already selects the axis, so no `axis` argument is needed.
cdf = cdf.drop(columns=['Unnamed: 0', 'id'])
print(cdf.head(1))
print(cdf.satisfaction.value_counts())
# Bar chart of the class counts; it is displayed by plt.show() at the end.
cdf['satisfaction'].value_counts().plot(kind='bar')
# DataFrame.info() prints its report itself and returns None, so it is
# called directly rather than wrapped in print() (which would emit "None").
cdf.info()
cdf = cdf.dropna()
cdf.info()

# --- Features / target ----------------------------------------------------
y = cdf.satisfaction
x = cdf.drop('satisfaction', axis=1)
print('x shape', x.shape)
print('y shape', y.shape)
x_gd = pd.get_dummies(
    x,
    columns=['Gender', 'Customer Type', 'Type of Travel', 'Class'],
    drop_first=False,
)
x_gd.info()

# --- Train/test split (stratified on the target) --------------------------
x_train, x_test, y_train, y_test = train_test_split(
    x_gd, y, stratify=y, test_size=0.2, random_state=2023
)

# --- Encode the target labels as integers ---------------------------------
# The encoder is fitted on the training labels only and then applied to both
# splits.
le = LabelEncoder()
le.fit(y_train)
le_y_train = le.transform(y_train)
le_y_test = le.transform(y_test)
print('data after labelling', le_y_train)
print(le_y_train.shape)
print('classes', le.classes_)
print('decoding', le.inverse_transform(le_y_train))

# --- Fit the model and report accuracy ------------------------------------
rfc = RandomForestClassifier(
    random_state=1,
    n_estimators=50,
    max_features=None,
    n_jobs=-1,
    max_depth=10,
)
rfc.fit(x_train, le_y_train)
acc_train = rfc.score(x_train, le_y_train)
acc_test = rfc.score(x_test, le_y_test)
print('acc', acc_train, acc_test)

# --- Confusion matrix and per-class report on the test split --------------
y_pred = rfc.predict(x_test)
cm = confusion_matrix(le_y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot()
print(classification_report(le_y_test, y_pred))
plt.show()
# NOTE: stray web-page text (a "Comments" section heading) followed the script here; it is not code.