# titanic-v4(rf).py -- forked from algorithmica-repository/datascience
# (web-page residue from the repository viewer: 49 lines, 37 loc, 1.64 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import pandas as pd
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import model_selection
# Show the current working directory (the value is only displayed when the
# script is run statement-by-statement in an interactive console).
os.getcwd()
# Switch to the folder holding the CSV files so the relative file names
# used by read_csv/to_csv resolve there.
os.chdir("C:\\Users\\Algorithmica\\Downloads")
titanic_train = pd.read_csv("train.csv")

# EDA: dimensions, column dtypes, and missing-value counts (for Pclass alone
# and then for every column).
titanic_train.shape
titanic_train.info()
titanic_train['Pclass'].isnull().sum()
titanic_train.isnull().sum()

# Fill missing embarkation ports with 'S'.
# A single .loc call assigns into titanic_train itself. The chained form
# `df.Embarked[mask] = ...` assigns into an intermediate Series, which emits
# SettingWithCopyWarning and does not update the frame under Copy-on-Write.
titanic_train.loc[titanic_train['Embarked'].isnull(), 'Embarked'] = 'S'
# Encode on a copy so the raw training frame keeps its original labels.
titanic_train1 = titanic_train.copy()

# Replace each categorical column with integer codes. The single encoder is
# refit for every column, so after the loop it only remembers the mapping of
# the last column ('Pclass'). Every column is read from titanic_train1 (the
# copy) rather than mixing in the pre-copy frame.
le = preprocessing.LabelEncoder()
for column in ('Sex', 'Embarked', 'Pclass'):
    titanic_train1[column] = le.fit_transform(titanic_train1[column])

# Feature matrix and target used to fit the model.
X_train = titanic_train1[['Sex','Embarked','Pclass','Fare']]
y_train = titanic_train1['Survived']
# The OOB score is computed as part of the model construction process
# because oob_score=True is passed to the forest.
rf_params = dict(
    n_estimators=200,
    oob_score=True,
    max_features=4,
    random_state=10,
)
rf_estimator = ensemble.RandomForestClassifier(**rf_params)
rf_estimator.fit(X_train, y_train)
# Out-of-bag score of the fitted forest (displayed when run interactively).
rf_estimator.oob_score_
titanic_test = pd.read_csv("test.csv")
# Per-column missing-value counts for the test set.
titanic_test.isnull().sum()

# Fill missing fares with the mean fare of the test set (mean() skips NaN).
# A single .loc call assigns into titanic_test itself; the chained form
# `df.Fare[mask] = ...` assigns into an intermediate Series and does not
# update the frame under Copy-on-Write.
titanic_test.loc[titanic_test['Fare'].isnull(), 'Fare'] = titanic_test['Fare'].mean()

titanic_test1 = titanic_test.copy()
# Encode the test categoricals with mappings learned from the raw training
# columns, so each label gets the same integer code the model was trained on.
# Refitting an encoder on the test data would only give matching codes if
# both sets contained exactly the same categories. transform() raises
# ValueError on a label that never occurred in training instead of silently
# assigning it a shifted code.
for column in ('Sex', 'Embarked', 'Pclass'):
    train_encoder = preprocessing.LabelEncoder().fit(titanic_train[column])
    titanic_test1[column] = train_encoder.transform(titanic_test1[column])

# Same feature columns, in the same order, as used for training.
X_test = titanic_test1[['Sex','Embarked','Pclass','Fare']]
titanic_test1['Survived'] = rf_estimator.predict(X_test)
# Write only the passenger id and predicted label, without the index column.
titanic_test1.to_csv("submission.csv", columns=['PassengerId','Survived'], index=False)