{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "7683d99c",
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
"execution": {
"iopub.execute_input": "2022-02-24T17:43:50.738416Z",
"iopub.status.busy": "2022-02-24T17:43:50.737726Z",
"iopub.status.idle": "2022-02-24T17:43:50.743226Z",
"shell.execute_reply": "2022-02-24T17:43:50.743950Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.851323Z"
},
"papermill": {
"duration": 0.031549,
"end_time": "2022-02-24T17:43:50.744310",
"exception": false,
"start_time": "2022-02-24T17:43:50.712761",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/kaggle/input/titanic/train.csv\n",
"/kaggle/input/titanic/test.csv\n",
"/kaggle/input/titanic/gender_submission.csv\n"
]
}
],
"source": [
"# This Python 3 environment comes with many helpful analytics libraries installed\n",
"# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n",
"# For example, here's several helpful packages to load\n",
"\n",
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"\n",
"# Input data files are available in the read-only \"../input/\" directory\n",
"# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
"\n",
"import os\n",
"for dirname, _, filenames in os.walk('/kaggle/input'):\n",
" for filename in filenames:\n",
" print(os.path.join(dirname, filename))\n",
"\n",
"# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n",
"# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "89ace29c",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:50.774853Z",
"iopub.status.busy": "2022-02-24T17:43:50.774143Z",
"iopub.status.idle": "2022-02-24T17:43:50.812110Z",
"shell.execute_reply": "2022-02-24T17:43:50.812592Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.862741Z"
},
"papermill": {
"duration": 0.054029,
"end_time": "2022-02-24T17:43:50.812756",
"exception": false,
"start_time": "2022-02-24T17:43:50.758727",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Survived | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 3 | \n",
" Braund, Mr. Owen Harris | \n",
" male | \n",
" 22.0 | \n",
" 1 | \n",
" 0 | \n",
" A/5 21171 | \n",
" 7.2500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
" female | \n",
" 38.0 | \n",
" 1 | \n",
" 0 | \n",
" PC 17599 | \n",
" 71.2833 | \n",
" C85 | \n",
" C | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 1 | \n",
" 3 | \n",
" Heikkinen, Miss. Laina | \n",
" female | \n",
" 26.0 | \n",
" 0 | \n",
" 0 | \n",
" STON/O2. 3101282 | \n",
" 7.9250 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 1 | \n",
" 1 | \n",
" Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
" female | \n",
" 35.0 | \n",
" 1 | \n",
" 0 | \n",
" 113803 | \n",
" 53.1000 | \n",
" C123 | \n",
" S | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0 | \n",
" 3 | \n",
" Allen, Mr. William Henry | \n",
" male | \n",
" 35.0 | \n",
" 0 | \n",
" 0 | \n",
" 373450 | \n",
" 8.0500 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Survived Pclass \\\n",
"0 1 0 3 \n",
"1 2 1 1 \n",
"2 3 1 3 \n",
"3 4 1 1 \n",
"4 5 0 3 \n",
"\n",
" Name Sex Age SibSp \\\n",
"0 Braund, Mr. Owen Harris male 22.0 1 \n",
"1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
"2 Heikkinen, Miss. Laina female 26.0 0 \n",
"3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
"4 Allen, Mr. William Henry male 35.0 0 \n",
"\n",
" Parch Ticket Fare Cabin Embarked \n",
"0 0 A/5 21171 7.2500 NaN S \n",
"1 0 PC 17599 71.2833 C85 C \n",
"2 0 STON/O2. 3101282 7.9250 NaN S \n",
"3 0 113803 53.1000 C123 S \n",
"4 0 373450 8.0500 NaN S "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Getting the training data ready\n",
"train_data = pd.read_csv(\"/kaggle/input/titanic/train.csv\")\n",
"train_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6a2f141f",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:50.849879Z",
"iopub.status.busy": "2022-02-24T17:43:50.849230Z",
"iopub.status.idle": "2022-02-24T17:43:50.877985Z",
"shell.execute_reply": "2022-02-24T17:43:50.878558Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.892959Z"
},
"papermill": {
"duration": 0.04977,
"end_time": "2022-02-24T17:43:50.878753",
"exception": false,
"start_time": "2022-02-24T17:43:50.828983",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" PassengerId | \n",
" Pclass | \n",
" Name | \n",
" Sex | \n",
" Age | \n",
" SibSp | \n",
" Parch | \n",
" Ticket | \n",
" Fare | \n",
" Cabin | \n",
" Embarked | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 892 | \n",
" 3 | \n",
" Kelly, Mr. James | \n",
" male | \n",
" 34.5 | \n",
" 0 | \n",
" 0 | \n",
" 330911 | \n",
" 7.8292 | \n",
" NaN | \n",
" Q | \n",
"
\n",
" \n",
" 1 | \n",
" 893 | \n",
" 3 | \n",
" Wilkes, Mrs. James (Ellen Needs) | \n",
" female | \n",
" 47.0 | \n",
" 1 | \n",
" 0 | \n",
" 363272 | \n",
" 7.0000 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 2 | \n",
" 894 | \n",
" 2 | \n",
" Myles, Mr. Thomas Francis | \n",
" male | \n",
" 62.0 | \n",
" 0 | \n",
" 0 | \n",
" 240276 | \n",
" 9.6875 | \n",
" NaN | \n",
" Q | \n",
"
\n",
" \n",
" 3 | \n",
" 895 | \n",
" 3 | \n",
" Wirz, Mr. Albert | \n",
" male | \n",
" 27.0 | \n",
" 0 | \n",
" 0 | \n",
" 315154 | \n",
" 8.6625 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
" 4 | \n",
" 896 | \n",
" 3 | \n",
" Hirvonen, Mrs. Alexander (Helga E Lindqvist) | \n",
" female | \n",
" 22.0 | \n",
" 1 | \n",
" 1 | \n",
" 3101298 | \n",
" 12.2875 | \n",
" NaN | \n",
" S | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" PassengerId Pclass Name Sex \\\n",
"0 892 3 Kelly, Mr. James male \n",
"1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n",
"2 894 2 Myles, Mr. Thomas Francis male \n",
"3 895 3 Wirz, Mr. Albert male \n",
"4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n",
"\n",
" Age SibSp Parch Ticket Fare Cabin Embarked \n",
"0 34.5 0 0 330911 7.8292 NaN Q \n",
"1 47.0 1 0 363272 7.0000 NaN S \n",
"2 62.0 0 0 240276 9.6875 NaN Q \n",
"3 27.0 0 0 315154 8.6625 NaN S \n",
"4 22.0 1 1 3101298 12.2875 NaN S "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Getting the testing data ready\n",
"test_data = pd.read_csv(\"/kaggle/input/titanic/test.csv\")\n",
"test_data.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "703d2216",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:50.920564Z",
"iopub.status.busy": "2022-02-24T17:43:50.919717Z",
"iopub.status.idle": "2022-02-24T17:43:50.935174Z",
"shell.execute_reply": "2022-02-24T17:43:50.934521Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.916674Z"
},
"papermill": {
"duration": 0.038399,
"end_time": "2022-02-24T17:43:50.935331",
"exception": false,
"start_time": "2022-02-24T17:43:50.896932",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Combining the data sets\n",
"all_data = train_data.append(test_data)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ece0d422",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:50.978878Z",
"iopub.status.busy": "2022-02-24T17:43:50.971153Z",
"iopub.status.idle": "2022-02-24T17:43:50.982512Z",
"shell.execute_reply": "2022-02-24T17:43:50.983048Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.930805Z"
},
"papermill": {
"duration": 0.033935,
"end_time": "2022-02-24T17:43:50.983228",
"exception": false,
"start_time": "2022-02-24T17:43:50.949293",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The number of missing values in Pclass column: 0\n",
"The number of missing values in Name column: 0\n",
"The number of missing values in Age column: 263\n",
"The number of missing values in SibSp column: 0\n",
"The number of missing values in Parch column: 0\n",
"The number of missing values in Ticket column: 0\n",
"The number of missing values in Fare column: 1\n",
"The number of missing values in Cabin column: 1014\n",
"The number of missing values in Embarked column: 2\n"
]
}
],
"source": [
"# Exploratory Anlaysis\n",
"# Number of missing values\n",
"print(\"The number of missing values in Pclass column: \"+str(all_data[\"Pclass\"].isna().sum()))\n",
"print(\"The number of missing values in Name column: \"+str(all_data[\"Name\"].isna().sum()))\n",
"print(\"The number of missing values in Age column: \"+str(all_data[\"Age\"].isna().sum()))\n",
"print(\"The number of missing values in SibSp column: \"+str(all_data[\"SibSp\"].isna().sum()))\n",
"print(\"The number of missing values in Parch column: \"+str(all_data[\"Parch\"].isna().sum()))\n",
"print(\"The number of missing values in Ticket column: \"+str(all_data[\"Ticket\"].isna().sum()))\n",
"print(\"The number of missing values in Fare column: \"+str(all_data[\"Fare\"].isna().sum()))\n",
"print(\"The number of missing values in Cabin column: \"+str(all_data[\"Cabin\"].isna().sum()))\n",
"print(\"The number of missing values in Embarked column: \"+str(all_data[\"Embarked\"].isna().sum()))"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "084c20f3",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:51.018432Z",
"iopub.status.busy": "2022-02-24T17:43:51.017533Z",
"iopub.status.idle": "2022-02-24T17:43:51.020112Z",
"shell.execute_reply": "2022-02-24T17:43:51.020569Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.954715Z"
},
"papermill": {
"duration": 0.02337,
"end_time": "2022-02-24T17:43:51.020743",
"exception": false,
"start_time": "2022-02-24T17:43:50.997373",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Filling the missing ages with a median value\n",
"train_data['Age'].fillna(train_data['Age'].median(), inplace=True)\n",
"test_data['Age'].fillna(test_data['Age'].median(), inplace =True)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "28ab31ea",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:51.065337Z",
"iopub.status.busy": "2022-02-24T17:43:51.064499Z",
"iopub.status.idle": "2022-02-24T17:43:51.076855Z",
"shell.execute_reply": "2022-02-24T17:43:51.076356Z",
"shell.execute_reply.started": "2022-02-24T17:42:56.970791Z"
},
"papermill": {
"duration": 0.04212,
"end_time": "2022-02-24T17:43:51.077056",
"exception": false,
"start_time": "2022-02-24T17:43:51.034936",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self._setitem_single_block(indexer, value, name)\n"
]
}
],
"source": [
"# Creating a Is_Married column\n",
"train_data['Title'] = train_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]\n",
"train_data['Is_Married'] = 0\n",
"train_data['Is_Married'].loc[train_data['Title'] == 'Mrs'] = 1\n",
"\n",
"test_data['Title'] = test_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]\n",
"test_data['Is_Married'] = 0\n",
"test_data['Is_Married'].loc[test_data['Title'] == 'Mrs'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cce613ad",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:51.114090Z",
"iopub.status.busy": "2022-02-24T17:43:51.113406Z",
"iopub.status.idle": "2022-02-24T17:43:51.116136Z",
"shell.execute_reply": "2022-02-24T17:43:51.115482Z",
"shell.execute_reply.started": "2022-02-24T17:43:14.059488Z"
},
"papermill": {
"duration": 0.024483,
"end_time": "2022-02-24T17:43:51.116307",
"exception": false,
"start_time": "2022-02-24T17:43:51.091824",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": [
"# Famiy Size Colation - since there is correlation between them\n",
"train_data[\"Family_Size\"] = train_data['SibSp']+train_data['Parch']\n",
"test_data[\"Family_Size\"] = test_data['SibSp']+test_data['Parch']"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e20c94f1",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:51.153198Z",
"iopub.status.busy": "2022-02-24T17:43:51.152407Z",
"iopub.status.idle": "2022-02-24T17:43:52.537648Z",
"shell.execute_reply": "2022-02-24T17:43:52.538173Z",
"shell.execute_reply.started": "2022-02-24T17:42:57.006490Z"
},
"papermill": {
"duration": 1.407363,
"end_time": "2022-02-24T17:43:52.538358",
"exception": false,
"start_time": "2022-02-24T17:43:51.130995",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1\n",
" 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0\n",
" 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0\n",
" 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0\n",
" 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1\n",
" 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0\n",
" 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1\n",
" 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0\n",
" 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0\n",
" 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0\n",
" 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0\n",
" 0 1 1 1 1 1 0 1 0 0 0]\n"
]
}
],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"y = train_data[\"Survived\"]\n",
"\n",
"features = [\"Pclass\",\"Sex\",\"Family_Size\",\"Age\",\"Is_Married\"]\n",
"X = pd.get_dummies(train_data[features])\n",
"X_test = pd.get_dummies(test_data[features])\n",
"\n",
"model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)\n",
"\n",
"model.fit(X, y)\n",
"predictions = model.predict(X_test)\n",
"\n",
"print(predictions)\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "9fb38675",
"metadata": {
"execution": {
"iopub.execute_input": "2022-02-24T17:43:52.575237Z",
"iopub.status.busy": "2022-02-24T17:43:52.574558Z",
"iopub.status.idle": "2022-02-24T17:43:52.580883Z",
"shell.execute_reply": "2022-02-24T17:43:52.581383Z",
"shell.execute_reply.started": "2022-02-24T17:42:57.253906Z"
},
"papermill": {
"duration": 0.027986,
"end_time": "2022-02-24T17:43:52.581554",
"exception": false,
"start_time": "2022-02-24T17:43:52.553568",
"status": "completed"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Your submission was successfully saved!\n"
]
}
],
"source": [
"output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})\n",
"output.to_csv('submission.csv', index=False)\n",
"print(\"Your submission was successfully saved!\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3238b4d8",
"metadata": {
"papermill": {
"duration": 0.015041,
"end_time": "2022-02-24T17:43:52.612170",
"exception": false,
"start_time": "2022-02-24T17:43:52.597129",
"status": "completed"
},
"tags": []
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
},
"papermill": {
"default_parameters": {},
"duration": 12.032025,
"end_time": "2022-02-24T17:43:53.337607",
"environment_variables": {},
"exception": null,
"input_path": "__notebook__.ipynb",
"output_path": "__notebook__.ipynb",
"parameters": {},
"start_time": "2022-02-24T17:43:41.305582",
"version": "2.3.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}