{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "7683d99c", "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "execution": { "iopub.execute_input": "2022-02-24T17:43:50.738416Z", "iopub.status.busy": "2022-02-24T17:43:50.737726Z", "iopub.status.idle": "2022-02-24T17:43:50.743226Z", "shell.execute_reply": "2022-02-24T17:43:50.743950Z", "shell.execute_reply.started": "2022-02-24T17:42:56.851323Z" }, "papermill": { "duration": 0.031549, "end_time": "2022-02-24T17:43:50.744310", "exception": false, "start_time": "2022-02-24T17:43:50.712761", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/kaggle/input/titanic/train.csv\n", "/kaggle/input/titanic/test.csv\n", "/kaggle/input/titanic/gender_submission.csv\n" ] } ], "source": [ "# This Python 3 environment comes with many helpful analytics libraries installed\n", "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", "# For example, here's several helpful packages to load\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n", "\n", "# Input data files are available in the read-only \"../input/\" directory\n", "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", "\n", "import os\n", "for dirname, _, filenames in os.walk('/kaggle/input'):\n", " for filename in filenames:\n", " print(os.path.join(dirname, filename))\n", "\n", "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n", "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" ] }, { "cell_type": "code", "execution_count": 2, "id": "89ace29c", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:50.774853Z", "iopub.status.busy": "2022-02-24T17:43:50.774143Z", "iopub.status.idle": "2022-02-24T17:43:50.812110Z", "shell.execute_reply": "2022-02-24T17:43:50.812592Z", "shell.execute_reply.started": "2022-02-24T17:42:56.862741Z" }, "papermill": { "duration": 0.054029, "end_time": "2022-02-24T17:43:50.812756", "exception": false, "start_time": "2022-02-24T17:43:50.758727", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Getting the training data ready\n", "train_data = pd.read_csv(\"/kaggle/input/titanic/train.csv\")\n", "train_data.head()" ] }, { "cell_type": "code", "execution_count": 3, "id": "6a2f141f", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:50.849879Z", "iopub.status.busy": "2022-02-24T17:43:50.849230Z", "iopub.status.idle": "2022-02-24T17:43:50.877985Z", "shell.execute_reply": "2022-02-24T17:43:50.878558Z", "shell.execute_reply.started": "2022-02-24T17:42:56.892959Z" }, "papermill": { "duration": 0.04977, "end_time": "2022-02-24T17:43:50.878753", "exception": false, "start_time": "2022-02-24T17:43:50.828983", "status": "completed" }, "tags": [] }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
08923Kelly, Mr. Jamesmale34.5003309117.8292NaNQ
18933Wilkes, Mrs. James (Ellen Needs)female47.0103632727.0000NaNS
28942Myles, Mr. Thomas Francismale62.0002402769.6875NaNQ
38953Wirz, Mr. Albertmale27.0003151548.6625NaNS
48963Hirvonen, Mrs. Alexander (Helga E Lindqvist)female22.011310129812.2875NaNS
\n", "
" ], "text/plain": [ " PassengerId Pclass Name Sex \\\n", "0 892 3 Kelly, Mr. James male \n", "1 893 3 Wilkes, Mrs. James (Ellen Needs) female \n", "2 894 2 Myles, Mr. Thomas Francis male \n", "3 895 3 Wirz, Mr. Albert male \n", "4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female \n", "\n", " Age SibSp Parch Ticket Fare Cabin Embarked \n", "0 34.5 0 0 330911 7.8292 NaN Q \n", "1 47.0 1 0 363272 7.0000 NaN S \n", "2 62.0 0 0 240276 9.6875 NaN Q \n", "3 27.0 0 0 315154 8.6625 NaN S \n", "4 22.0 1 1 3101298 12.2875 NaN S " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Getting the testing data ready\n", "test_data = pd.read_csv(\"/kaggle/input/titanic/test.csv\")\n", "test_data.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "703d2216", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:50.920564Z", "iopub.status.busy": "2022-02-24T17:43:50.919717Z", "iopub.status.idle": "2022-02-24T17:43:50.935174Z", "shell.execute_reply": "2022-02-24T17:43:50.934521Z", "shell.execute_reply.started": "2022-02-24T17:42:56.916674Z" }, "papermill": { "duration": 0.038399, "end_time": "2022-02-24T17:43:50.935331", "exception": false, "start_time": "2022-02-24T17:43:50.896932", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Combining the data sets\n", "all_data = train_data.append(test_data)" ] }, { "cell_type": "code", "execution_count": 5, "id": "ece0d422", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:50.978878Z", "iopub.status.busy": "2022-02-24T17:43:50.971153Z", "iopub.status.idle": "2022-02-24T17:43:50.982512Z", "shell.execute_reply": "2022-02-24T17:43:50.983048Z", "shell.execute_reply.started": "2022-02-24T17:42:56.930805Z" }, "papermill": { "duration": 0.033935, "end_time": "2022-02-24T17:43:50.983228", "exception": false, "start_time": "2022-02-24T17:43:50.949293", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "The number of missing values in Pclass column: 0\n", "The number of missing values in Name column: 0\n", "The number of missing values in Age column: 263\n", "The number of missing values in SibSp column: 0\n", "The number of missing values in Parch column: 0\n", "The number of missing values in Ticket column: 0\n", "The number of missing values in Fare column: 1\n", "The number of missing values in Cabin column: 1014\n", "The number of missing values in Embarked column: 2\n" ] } ], "source": [ "# Exploratory Anlaysis\n", "# Number of missing values\n", "print(\"The number of missing values in Pclass column: \"+str(all_data[\"Pclass\"].isna().sum()))\n", "print(\"The number of missing values in Name column: \"+str(all_data[\"Name\"].isna().sum()))\n", "print(\"The number of missing values in Age column: \"+str(all_data[\"Age\"].isna().sum()))\n", "print(\"The number of missing values in SibSp column: \"+str(all_data[\"SibSp\"].isna().sum()))\n", "print(\"The number of missing values in Parch column: \"+str(all_data[\"Parch\"].isna().sum()))\n", "print(\"The number of missing values in Ticket column: \"+str(all_data[\"Ticket\"].isna().sum()))\n", "print(\"The number of missing values in Fare column: \"+str(all_data[\"Fare\"].isna().sum()))\n", "print(\"The number of missing values in Cabin column: \"+str(all_data[\"Cabin\"].isna().sum()))\n", "print(\"The number of missing values in Embarked column: \"+str(all_data[\"Embarked\"].isna().sum()))" ] }, { "cell_type": "code", "execution_count": 6, "id": "084c20f3", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:51.018432Z", "iopub.status.busy": "2022-02-24T17:43:51.017533Z", "iopub.status.idle": "2022-02-24T17:43:51.020112Z", "shell.execute_reply": "2022-02-24T17:43:51.020569Z", "shell.execute_reply.started": "2022-02-24T17:42:56.954715Z" }, "papermill": { "duration": 0.02337, "end_time": "2022-02-24T17:43:51.020743", "exception": false, "start_time": 
"2022-02-24T17:43:50.997373", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Filling the missing ages with a median value\n", "train_data['Age'].fillna(train_data['Age'].median(), inplace=True)\n", "test_data['Age'].fillna(test_data['Age'].median(), inplace =True)" ] }, { "cell_type": "code", "execution_count": 7, "id": "28ab31ea", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:51.065337Z", "iopub.status.busy": "2022-02-24T17:43:51.064499Z", "iopub.status.idle": "2022-02-24T17:43:51.076855Z", "shell.execute_reply": "2022-02-24T17:43:51.076356Z", "shell.execute_reply.started": "2022-02-24T17:42:56.970791Z" }, "papermill": { "duration": 0.04212, "end_time": "2022-02-24T17:43:51.077056", "exception": false, "start_time": "2022-02-24T17:43:51.034936", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/conda/lib/python3.7/site-packages/pandas/core/indexing.py:1732: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " self._setitem_single_block(indexer, value, name)\n" ] } ], "source": [ "# Creating a Is_Married column\n", "train_data['Title'] = train_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]\n", "train_data['Is_Married'] = 0\n", "train_data['Is_Married'].loc[train_data['Title'] == 'Mrs'] = 1\n", "\n", "test_data['Title'] = test_data['Name'].str.split(', ', expand=True)[1].str.split('.', expand=True)[0]\n", "test_data['Is_Married'] = 0\n", "test_data['Is_Married'].loc[test_data['Title'] == 'Mrs'] = 1" ] }, { "cell_type": "code", "execution_count": 8, "id": "cce613ad", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:51.114090Z", "iopub.status.busy": "2022-02-24T17:43:51.113406Z", "iopub.status.idle": 
"2022-02-24T17:43:51.116136Z", "shell.execute_reply": "2022-02-24T17:43:51.115482Z", "shell.execute_reply.started": "2022-02-24T17:43:14.059488Z" }, "papermill": { "duration": 0.024483, "end_time": "2022-02-24T17:43:51.116307", "exception": false, "start_time": "2022-02-24T17:43:51.091824", "status": "completed" }, "tags": [] }, "outputs": [], "source": [ "# Famiy Size Colation - since there is correlation between them\n", "train_data[\"Family_Size\"] = train_data['SibSp']+train_data['Parch']\n", "test_data[\"Family_Size\"] = test_data['SibSp']+test_data['Parch']" ] }, { "cell_type": "code", "execution_count": 9, "id": "e20c94f1", "metadata": { "execution": { "iopub.execute_input": "2022-02-24T17:43:51.153198Z", "iopub.status.busy": "2022-02-24T17:43:51.152407Z", "iopub.status.idle": "2022-02-24T17:43:52.537648Z", "shell.execute_reply": "2022-02-24T17:43:52.538173Z", "shell.execute_reply.started": "2022-02-24T17:42:57.006490Z" }, "papermill": { "duration": 1.407363, "end_time": "2022-02-24T17:43:52.538358", "exception": false, "start_time": "2022-02-24T17:43:51.130995", "status": "completed" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1\n", " 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0\n", " 1 0 0 1 0 1 1 0 0 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0\n", " 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0\n", " 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1\n", " 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0\n", " 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1\n", " 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0\n", " 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0\n", " 1 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0\n", " 0 1 
# %%
from sklearn.ensemble import RandomForestClassifier

# Target column the classifier is trained to predict.
y = train_data["Survived"]

features = ["Pclass", "Sex", "Family_Size", "Age", "Is_Married"]

# One-hot encode the non-numeric feature columns.  Train and test are encoded
# independently, so the test matrix is aligned to the training columns: a
# category absent from the test set becomes an all-zero column, a category
# unseen in training is dropped, and the column order matches what the model
# was fitted on.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# random_state is fixed so repeated runs produce the same forest.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)

model.fit(X, y)
predictions = model.predict(X_test)

print(predictions)

# %%
# Write the submission file: one row per test passenger with the predicted
# 'Survived' value, without the DataFrame index.
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" }, "papermill": { "default_parameters": {}, "duration": 12.032025, "end_time": "2022-02-24T17:43:53.337607", "environment_variables": {}, "exception": null, "input_path": "__notebook__.ipynb", "output_path": "__notebook__.ipynb", "parameters": {}, "start_time": "2022-02-24T17:43:41.305582", "version": "2.3.3" } }, "nbformat": 4, "nbformat_minor": 5 }