diff --git a/week 3/.ipynb_checkpoints/Week 3 - Pandas-checkpoint.ipynb b/week 3/.ipynb_checkpoints/Week 3 - Pandas-checkpoint.ipynb index 2fd6442..23ea750 100644 --- a/week 3/.ipynb_checkpoints/Week 3 - Pandas-checkpoint.ipynb +++ b/week 3/.ipynb_checkpoints/Week 3 - Pandas-checkpoint.ipynb @@ -1,6 +1,2410 @@ { - "cells": [], - "metadata": {}, + "cells": [ + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Week 3 - Pandas\n", + "Pandas, a library written by Wes McKinney, is a great tool for data manipulation and analysis. It provides two classes:\n", + "* a Series object, which handles a single column of data;\n", + "* a DataFrame object, which handles multiple columns (like an Excel spreadsheet).\n", + "\n", + "You can build your own DataFrames or read in from other sources like CSVs or JSON. Pandas handles missing data beautifully; lets you sort, operate on and merge datasets; provides plotting capabilities; and handles time series data (among other advantages)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.1 Creating Series and DataFrames" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Cardiff\n", + "1 Swansea\n", + "2 Abergavenny\n", + "3 Machynlleth\n", + "dtype: object" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a series by passing a list\n", + "\n", + "towns = pd.Series(['Cardiff', 'Swansea', 'Abergavenny','Machynlleth'])\n", + "towns" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [], + "source": [ + "towns = ['Cardiff', 'Swansea', 'Abergavenny','Machynlleth']\n", + "populations = [335145, 230300, 12515, 2235]\n", + "number_of_pubs = [2100, 1680, 198, 48]" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "# Create an empty DataFrame, and add new columns to it\n", + "\n", + "towns_df = pd.DataFrame()\n", + "towns_df['name'] = towns\n", + "towns_df['population'] = populations" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "name object\n", + "population int64\n", + "dtype: object" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The columns have different dtypes\n", + "\n", + "towns_df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['name', 'population'], dtype='object')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can access the column names list as so:\n", + "towns_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ 
+ "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubs
0Cardiff3351452100
1Swansea2303001680
2Abergavenny12515198
3Machynlleth223548
\n", + "
" + ], + "text/plain": [ + " name population n_pubs\n", + "0 Cardiff 335145 2100\n", + "1 Swansea 230300 1680\n", + "2 Abergavenny 12515 198\n", + "3 Machynlleth 2235 48" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Create a DataFrame using dictionaries to specify column name and data\n", + "\n", + "towns_df = pd.DataFrame({'name': towns,\n", + " 'population': populations,\n", + " 'n_pubs': number_of_pubs})\n", + "\n", + "towns_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.2 View and select data" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubs
0Cardiff3351452100
1Swansea2303001680
\n", + "
" + ], + "text/plain": [ + " name population n_pubs\n", + "0 Cardiff 335145 2100\n", + "1 Swansea 230300 1680" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# the .head() method shows the top rows\n", + "\n", + "towns_df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(4, 3)" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# check how many rows and columns\n", + "towns_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 335145\n", + "1 230300\n", + "2 12515\n", + "3 2235\n", + "Name: population, dtype: int64" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Inspect only one series using square bracket notation\n", + "\n", + "towns_df['population']" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 335145\n", + "1 230300\n", + "2 12515\n", + "3 2235\n", + "Name: population, dtype: int64" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Or dot notation WARNING: not available in all circumstances, e.g. when defining a new column\n", + "\n", + "towns_df.population" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubs
0Cardiff3351452100
1Swansea2303001680
2Abergavenny12515198
\n", + "
" + ], + "text/plain": [ + " name population n_pubs\n", + "0 Cardiff 335145 2100\n", + "1 Swansea 230300 1680\n", + "2 Abergavenny 12515 198" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Standard Python indexing works in the row direction\n", + "\n", + "towns_df[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2100\n", + "1 1680\n", + "2 198\n", + "3 48\n", + "Name: n_pubs, dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# typically, column indexing should come first\n", + "towns_df['n_pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2100\n", + "1 1680\n", + "2 198\n", + "Name: n_pubs, dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df['n_pubs'][:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.3 Select and manipulate data" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubs
2Abergavenny12515198
\n", + "
" + ], + "text/plain": [ + " name population n_pubs\n", + "2 Abergavenny 12515 198" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Use Boolean indexing to inspect values based on a condition\n", + "\n", + "towns_df.loc[towns_df.name == 'Abergavenny']" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a new column with math outputs\n", + "\n", + "towns_df['pubs_per_capita'] = towns_df.n_pubs / towns_df.population\n", + "towns_df['people_per_pub'] = towns_df.population / towns_df.n_pubs" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubspubs_per_capitapeople_per_pub
1Swansea23030016800.007295137.083333
2Abergavenny125151980.01582163.207071
3Machynlleth2235480.02147746.562500
\n", + "
" + ], + "text/plain": [ + " name population n_pubs pubs_per_capita people_per_pub\n", + "1 Swansea 230300 1680 0.007295 137.083333\n", + "2 Abergavenny 12515 198 0.015821 63.207071\n", + "3 Machynlleth 2235 48 0.021477 46.562500" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Use a single column's value to select data\n", + "\n", + "towns_df.loc[towns_df.people_per_pub < 150]" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [], + "source": [ + "# Use the .sort_values() method\n", + "# Helpful parameter: set inplace = True if you want to modify your df\n", + "\n", + "towns_df.sort_values(by = 'people_per_pub', inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "# Plot charts using the .plot() method \n", + "\n", + "towns_df.plot(x = 'name', y = 'pubs_per_capita', kind = 'bar', title = 'Some great towns to visit')" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\charl\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame\n", + "\n", + "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", + " after removing the cwd from sys.path.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubspubs_per_capitapeople_per_pubto_visit
3Machynlleth2235480.02147746.562500yes!
2Abergavenny125151980.01582163.207071no
1Swansea23030016800.007295137.083333no
0Cardiff33514521000.006266159.592857no
\n", + "
" + ], + "text/plain": [ + " name population n_pubs pubs_per_capita people_per_pub to_visit\n", + "3 Machynlleth 2235 48 0.021477 46.562500 yes!\n", + "2 Abergavenny 12515 198 0.015821 63.207071 no\n", + "1 Swansea 230300 1680 0.007295 137.083333 no\n", + "0 Cardiff 335145 2100 0.006266 159.592857 no" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Modify values\n", + "\n", + "towns_df['to_visit'] = 'no'\n", + "towns_df.to_visit[towns_df.people_per_pub <50] = 'yes!'\n", + "towns_df" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2100\n", + "Name: n_pubs, dtype: int64" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# More on .loc and .iloc\n", + "# .loc is a very flexible indexer. You can pass it pairs of (row, col) indexers to get a specific value:\n", + "towns_df.loc[towns_df['name'] == 'Cardiff', 'n_pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationn_pubspubs_per_capitapeople_per_pubto_visit
3Machynlleth2235480.02147746.562500yes!
2Abergavenny125151980.01582163.207071no
1Swansea23030016800.007295137.083333no
0Cardiff33514500.006266159.592857no
\n", + "
" + ], + "text/plain": [ + " name population n_pubs pubs_per_capita people_per_pub to_visit\n", + "3 Machynlleth 2235 48 0.021477 46.562500 yes!\n", + "2 Abergavenny 12515 198 0.015821 63.207071 no\n", + "1 Swansea 230300 1680 0.007295 137.083333 no\n", + "0 Cardiff 335145 0 0.006266 159.592857 no" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can also use this to set values:\n", + "towns_df.loc[towns_df['name'] == 'Cardiff', 'n_pubs'] = 0\n", + "towns_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# .iloc is used to get a row by its integer position (0 is the first row), not by its label in the index - the special column to the furthest left.\n", + "# It only works with integer indexers, unlike .loc, which selects by index label" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "name Cardiff\n", + "population 335145\n", + "n_pubs 0\n", + "pubs_per_capita 0.00626594\n", + "people_per_pub 159.593\n", + "to_visit no\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df.iloc[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "# you can change this index column by setting a new one:" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationn_pubspubs_per_capitapeople_per_pubto_visit
name
Machynlleth2235480.02147746.562500yes!
Abergavenny125151980.01582163.207071no
Swansea23030016800.007295137.083333no
Cardiff33514500.006266159.592857no
\n", + "
" + ], + "text/plain": [ + " population n_pubs pubs_per_capita people_per_pub to_visit\n", + "name \n", + "Machynlleth 2235 48 0.021477 46.562500 yes!\n", + "Abergavenny 12515 198 0.015821 63.207071 no\n", + "Swansea 230300 1680 0.007295 137.083333 no\n", + "Cardiff 335145 0 0.006266 159.592857 no" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df = towns_df.set_index('name')\n", + "towns_df" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Cannot index by location index with a non-integer key", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtowns_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Cardiff'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 1498\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1499\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1500\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1501\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1502\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m 2224\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2225\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2226\u001b[1;33m raise TypeError(\"Cannot index by location index with a \"\n\u001b[0m\u001b[0;32m 2227\u001b[0m \"non-integer key\")\n\u001b[0;32m 2228\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mTypeError\u001b[0m: Cannot index by location index with a non-integer key" + ] + } + ], + "source": [ + "# this will fail as Cardiff is not an indexer\n", + "towns_df.iloc['Cardiff']" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "335145" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# reformatting the towns_df DataFrame to make 'name' the index allows me to make calls like this using loc:\n", + "towns_df.loc['Cardiff','population']" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# indexes can be reset at any time:\n", + "towns_df = 
towns_df.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also select multiple columns at a time:" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulation
0Machynlleth2235
1Abergavenny12515
2Swansea230300
3Cardiff335145
\n", + "
" + ], + "text/plain": [ + " name population\n", + "0 Machynlleth 2235\n", + "1 Abergavenny 12515\n", + "2 Swansea 230300\n", + "3 Cardiff 335145" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df[['name','population']]" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=4, step=1)" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can see the index by calling it directly:\n", + "towns_df.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.4 Pandas II" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "# You can join pandas dataframes together in many ways" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "df_A = pd.DataFrame({\n", + " 'name':towns,\n", + " 'population':populations\n", + "})\n", + "\n", + "df_B = pd.DataFrame({\n", + " 'name':towns,\n", + " 'pubs':number_of_pubs\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationnamepubs
0Cardiff335145Cardiff2100
1Swansea230300Swansea1680
2Abergavenny12515Abergavenny198
3Machynlleth2235Machynlleth48
\n", + "
" + ], + "text/plain": [ + " name population name pubs\n", + "0 Cardiff 335145 Cardiff 2100\n", + "1 Swansea 230300 Swansea 1680\n", + "2 Abergavenny 12515 Abergavenny 198\n", + "3 Machynlleth 2235 Machynlleth 48" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df_A, df_B], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\charl\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationpubs
0Cardiff335145.0NaN
1Swansea230300.0NaN
2Abergavenny12515.0NaN
3Machynlleth2235.0NaN
0CardiffNaN2100.0
1SwanseaNaN1680.0
2AbergavennyNaN198.0
3MachynllethNaN48.0
\n", + "
" + ], + "text/plain": [ + " name population pubs\n", + "0 Cardiff 335145.0 NaN\n", + "1 Swansea 230300.0 NaN\n", + "2 Abergavenny 12515.0 NaN\n", + "3 Machynlleth 2235.0 NaN\n", + "0 Cardiff NaN 2100.0\n", + "1 Swansea NaN 1680.0\n", + "2 Abergavenny NaN 198.0\n", + "3 Machynlleth NaN 48.0" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df_A, df_B], axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [], + "source": [ + "# These don't look correct! (N.B. - but they would be if each DF contained the same columns). \n", + "# The way to get around this is to assign a common index that both frames share" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "df_A = df_A.set_index('name')" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "df_B = df_B.set_index('name')" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
population
name
Cardiff335145
Swansea230300
Abergavenny12515
Machynlleth2235
\n", + "
" + ], + "text/plain": [ + " population\n", + "name \n", + "Cardiff 335145\n", + "Swansea 230300\n", + "Abergavenny 12515\n", + "Machynlleth 2235" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_A" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pubs
name
Cardiff2100
Swansea1680
Abergavenny198
Machynlleth48
\n", + "
" + ], + "text/plain": [ + " pubs\n", + "name \n", + "Cardiff 2100\n", + "Swansea 1680\n", + "Abergavenny 198\n", + "Machynlleth 48" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_B" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "df_A['pubs'] = df_B['pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
name
Cardiff3351452100
Swansea2303001680
Abergavenny12515198
Machynlleth223548
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "name \n", + "Cardiff 335145 2100\n", + "Swansea 230300 1680\n", + "Abergavenny 12515 198\n", + "Machynlleth 2235 48" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_A" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
03351452100
12303001680
212515198
3223548
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "0 335145 2100\n", + "1 230300 1680\n", + "2 12515 198\n", + "3 2235 48" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can also combine DataFrames via '.merge', a method of DataFrames. N.B. with no 'on' argument it joins on the columns the two frames have in common (here 'pubs'), not on the index, so the 'name' index is not kept in the result\n", + "df_A.merge(df_B, how = 'inner')" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "# What if these dataFrames aren't the same size?" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "df_C = pd.DataFrame({\n", + " 'name':towns,\n", + " 'population':populations\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "df_D = pd.DataFrame({'name':'Winchester','population':40005}, index = [4])" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulation
4Winchester40005
\n", + "
" + ], + "text/plain": [ + " name population\n", + "4 Winchester 40005" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_D" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "df_C = df_C.append(df_D)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulation
0Cardiff335145
1Swansea230300
2Abergavenny12515
3Machynlleth2235
4Winchester40005
\n", + "
" + ], + "text/plain": [ + " name population\n", + "0 Cardiff 335145\n", + "1 Swansea 230300\n", + "2 Abergavenny 12515\n", + "3 Machynlleth 2235\n", + "4 Winchester 40005" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "df_C = df_C.set_index('name')" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
population
name
Cardiff335145
Swansea230300
Abergavenny12515
Machynlleth2235
Winchester40005
\n", + "
" + ], + "text/plain": [ + " population\n", + "name \n", + "Cardiff 335145\n", + "Swansea 230300\n", + "Abergavenny 12515\n", + "Machynlleth 2235\n", + "Winchester 40005" + ] + }, + "execution_count": 173, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pubs
name
Cardiff2100
Swansea1680
Abergavenny198
Machynlleth48
\n", + "
" + ], + "text/plain": [ + " pubs\n", + "name \n", + "Cardiff 2100\n", + "Swansea 1680\n", + "Abergavenny 198\n", + "Machynlleth 48" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_B" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [], + "source": [ + "df_C['pubs'] = df_B['pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
name
Cardiff3351452100.0
Swansea2303001680.0
Abergavenny12515198.0
Machynlleth223548.0
Winchester40005NaN
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "name \n", + "Cardiff 335145 2100.0\n", + "Swansea 230300 1680.0\n", + "Abergavenny 12515 198.0\n", + "Machynlleth 2235 48.0\n", + "Winchester 40005 NaN" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C['pubs'].loc['Winchester']" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.float64" + ] + }, + "execution_count": 178, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df_C['pubs'].loc['Winchester'])" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [], + "source": [ + "df_C['pubs'] = df_C['pubs'].fillna(850)" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
name
Cardiff3351452100.0
Swansea2303001680.0
Abergavenny12515198.0
Machynlleth223548.0
Winchester40005850.0
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "name \n", + "Cardiff 335145 2100.0\n", + "Swansea 230300 1680.0\n", + "Abergavenny 12515 198.0\n", + "Machynlleth 2235 48.0\n", + "Winchester 40005 850.0" + ] + }, + "execution_count": 181, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1 Read data from files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataframe using read_csv()\n", + "# Here, we would use os.path.join() to build the full path to the file\n", + "df = pd.read_csv('pluto_18v2_1.csv')\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[::20]\n", + "df.shape\n", + "df.to_csv('pluto_shortened.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('pluto_shortened.csv')\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2 Save data back to files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(os.path.join( [ your file location here!! 
]) )" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + } + }, "nbformat": 4, "nbformat_minor": 2 } diff --git a/week 3/Week 3 - Pandas.ipynb b/week 3/Week 3 - Pandas.ipynb index 8b9e3a0..23ea750 100644 --- a/week 3/Week 3 - Pandas.ipynb +++ b/week 3/Week 3 - Pandas.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 8, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 54, "metadata": {}, "outputs": [ { @@ -44,7 +44,7 @@ "dtype: object" ] }, - "execution_count": 5, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -58,7 +58,7 @@ }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -93,7 +93,7 @@ "dtype: object" ] }, - "execution_count": 128, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -106,7 +106,28 @@ }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['name', 'population'], dtype='object')" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can access the column names list as so:\n", + "towns_df.columns" + ] + }, + { + "cell_type": "code", + 
"execution_count": 59, "metadata": {}, "outputs": [ { @@ -172,7 +193,7 @@ "3 Machynlleth 2235 48" ] }, - "execution_count": 129, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -196,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 60, "metadata": {}, "outputs": [ { @@ -248,7 +269,7 @@ "1 Swansea 230300 1680" ] }, - "execution_count": 130, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -261,16 +282,16 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(4, 5)" + "(4, 3)" ] }, - "execution_count": 154, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -282,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 62, "metadata": {}, "outputs": [ { @@ -295,7 +316,7 @@ "Name: population, dtype: int64" ] }, - "execution_count": 131, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -308,7 +329,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -321,52 +342,20 @@ "Name: population, dtype: int64" ] }, - "execution_count": 132, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Or dot notation\n", + "# Or dot notation WARNING: not available in all circumstances, e.g. 
when defining a new column\n", "\n", "towns_df.population" ] }, { "cell_type": "code", - "execution_count": 133, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2100\n", - "1 1680\n", - "2 198\n", - "Name: n_pubs, dtype: int64" - ] - }, - "execution_count": 133, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Standard Python indexing works\n", - "\n", - "towns_df.n_pubs[:3]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 1.3 Select and manipulate data" - ] - }, - { - "cell_type": "code", - "execution_count": 134, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -397,6 +386,18 @@ " \n", " \n", " \n", + " 0\n", + " Cardiff\n", + " 335145\n", + " 2100\n", + " \n", + " \n", + " 1\n", + " Swansea\n", + " 230300\n", + " 1680\n", + " \n", + " \n", " 2\n", " Abergavenny\n", " 12515\n", @@ -408,23 +409,80 @@ ], "text/plain": [ " name population n_pubs\n", + "0 Cardiff 335145 2100\n", + "1 Swansea 230300 1680\n", "2 Abergavenny 12515 198" ] }, - "execution_count": 134, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Use Boolean indexing to inspect values based on a condition\n", + "# Standard Python indexing works in the row direction\n", "\n", - "towns_df[towns_df.name == 'Abergavenny']" + "towns_df[:3]" ] }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2100\n", + "1 1680\n", + "2 198\n", + "3 48\n", + "Name: n_pubs, dtype: int64" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# typically, column indexing should come first\n", + "towns_df['n_pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2100\n", + "1 1680\n", + "2 198\n", + "Name: n_pubs, dtype: int64" + ] + }, + 
"execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df['n_pubs'][:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.3 Select and manipulate data" + ] + }, + { + "cell_type": "code", + "execution_count": 67, "metadata": {}, "outputs": [ { @@ -451,72 +509,50 @@ " name\n", " population\n", " n_pubs\n", - " pubs_per_capita\n", - " people_per_pub\n", " \n", " \n", " \n", " \n", - " 0\n", - " Cardiff\n", - " 335145\n", - " 2100\n", - " 0.006266\n", - " 159.592857\n", - " \n", - " \n", - " 1\n", - " Swansea\n", - " 230300\n", - " 1680\n", - " 0.007295\n", - " 137.083333\n", - " \n", - " \n", " 2\n", " Abergavenny\n", " 12515\n", " 198\n", - " 0.015821\n", - " 63.207071\n", - " \n", - " \n", - " 3\n", - " Machynlleth\n", - " 2235\n", - " 48\n", - " 0.021477\n", - " 46.562500\n", " \n", " \n", "\n", "" ], "text/plain": [ - " name population n_pubs pubs_per_capita people_per_pub\n", - "0 Cardiff 335145 2100 0.006266 159.592857\n", - "1 Swansea 230300 1680 0.007295 137.083333\n", - "2 Abergavenny 12515 198 0.015821 63.207071\n", - "3 Machynlleth 2235 48 0.021477 46.562500" + " name population n_pubs\n", + "2 Abergavenny 12515 198" ] }, - "execution_count": 135, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "# Use Boolean indexing to inspect values based on a condition\n", + "\n", + "towns_df.loc[towns_df.name == 'Abergavenny']" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], "source": [ "# Create a new column with math outputs\n", "\n", "towns_df['pubs_per_capita'] = towns_df.n_pubs / towns_df.population\n", - "towns_df['people_per_pub'] = towns_df.population / towns_df.n_pubs\n", - "\n", - "towns_df" + "towns_df['people_per_pub'] = towns_df.population / towns_df.n_pubs" ] }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 69, "metadata": {}, "outputs": [ { @@ -583,7 
+619,7 @@ "3 Machynlleth 2235 48 0.021477 46.562500" ] }, - "execution_count": 136, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } @@ -591,12 +627,12 @@ "source": [ "# Use a single column's value to select data\n", "\n", - "towns_df[towns_df.people_per_pub < 150]" + "towns_df.loc[towns_df.people_per_pub < 150]" ] }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -608,22 +644,22 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 151, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -642,14 +678,14 @@ }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 72, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/Users/nicholasjones/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: \n", + "C:\\Users\\charl\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", @@ -687,12 +723,21 @@ " \n", " \n", " \n", - " 0\n", - " Cardiff\n", - " 335145\n", - " 2100\n", - " 0.006266\n", - " 159.592857\n", + " 3\n", + " Machynlleth\n", + " 2235\n", + " 48\n", + " 0.021477\n", + " 46.562500\n", + " yes!\n", + " \n", + " \n", + " 2\n", + " Abergavenny\n", + " 12515\n", + " 198\n", + " 0.015821\n", + " 63.207071\n", " no\n", " \n", " \n", @@ -705,36 +750,27 @@ " no\n", " \n", " \n", - " 2\n", - " Abergavenny\n", - " 12515\n", - " 183\n", - " 0.014622\n", - " 68.387978\n", + " 0\n", + " Cardiff\n", + " 335145\n", + " 2100\n", + " 0.006266\n", + " 159.592857\n", " no\n", " \n", - " \n", - " 3\n", - " Machynlleth\n", - " 2235\n", - " 48\n", - " 0.021477\n", - " 46.562500\n", - " yes!\n", - " \n", " \n", "\n", "" ], "text/plain": [ " name population n_pubs pubs_per_capita people_per_pub to_visit\n", - "0 Cardiff 335145 2100 0.006266 159.592857 no\n", + "3 Machynlleth 2235 48 0.021477 46.562500 yes!\n", + "2 Abergavenny 12515 198 0.015821 63.207071 no\n", "1 Swansea 230300 1680 0.007295 137.083333 no\n", - "2 Abergavenny 12515 183 0.014622 68.387978 no\n", - "3 Machynlleth 2235 48 0.021477 46.562500 yes!" 
+ "0 Cardiff 335145 2100 0.006266 159.592857 no" ] }, - "execution_count": 125, + "execution_count": 72, "metadata": {}, "output_type": "execute_result" } @@ -747,79 +783,32 @@ "towns_df" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.1 Read data from files" - ] - }, - { - "cell_type": "code", - "execution_count": 153, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/nicholasjones/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (16,17,18,20,22,77) have mixed types. Specify dtype option on import or set low_memory=False.\n", - " interactivity=interactivity, compiler=compiler, result=result)\n" - ] - }, - { - "data": { - "text/plain": [ - "(858982, 96)" - ] - }, - "execution_count": 153, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Create a dataframe using read_csv()\n", - "\n", - "df = pd.read_csv('pluto_18v2_1.csv')\n", - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 158, - "metadata": {}, - "outputs": [], - "source": [ - "df = df[::20]\n", - "df.shape\n", - "df.to_csv('pluto_shortened.csv')" - ] - }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(108, 97)" + "0 2100\n", + "Name: n_pubs, dtype: int64" ] }, - "execution_count": 159, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.read_csv('pluto_shortened.csv')\n", - "df.shape" + "# More on .loc and .iloc\n", + "# .loc is a very flexible indexer. 
You can pass it pairs of (row, col) indexers to get a specific value:\n", + "towns_df.loc[towns_df['name'] == 'Cardiff', 'n_pubs']" ] }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 74, "metadata": {}, "outputs": [ { @@ -843,188 +832,1558 @@ " \n", " \n", " \n", - " Unnamed: 0\n", - " borough\n", - " block\n", - " lot\n", - " cd\n", - " ct2010\n", - " cb2010\n", - " schooldist\n", - " council\n", - " zipcode\n", - " ...\n", - " firm07_flag\n", - " pfirm15_flag\n", - " rpaddate\n", - " dcasdate\n", - " zoningdate\n", - " landmkdate\n", - " basempdate\n", - " masdate\n", - " polidate\n", - " edesigdate\n", + " name\n", + " population\n", + " n_pubs\n", + " pubs_per_capita\n", + " people_per_pub\n", + " to_visit\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0\n", - " BX\n", - " 5641.0\n", - " 670.0\n", - " 210.0\n", - " 516.00\n", - " 2.0\n", - " NaN\n", - " 13.0\n", - " NaN\n", - " ...\n", - " 1.0\n", - " 1.0\n", - " 12/6/2018\n", - " 12/20/2018\n", - " 12/21/2018\n", - " 12/20/2018\n", - " 12/21/2018\n", - " NaN\n", - " NaN\n", - " 12/20/2018\n", - " \n", - " \n", - " 1\n", - " 8000\n", - " SI\n", - " 7864.0\n", - " 67.0\n", - " 503.0\n", - " 244.02\n", - " 1012.0\n", - " 31.0\n", - " 51.0\n", - " 10307.0\n", - " ...\n", - " NaN\n", - " NaN\n", - " 12/6/2018\n", - " 12/20/2018\n", - " 12/21/2018\n", - " 12/20/2018\n", - " 12/21/2018\n", + " 3\n", + " Machynlleth\n", + " 2235\n", + " 48\n", + " 0.021477\n", + " 46.562500\n", + " yes!\n", + " \n", + " \n", + " 2\n", + " Abergavenny\n", + " 12515\n", + " 198\n", + " 0.015821\n", + " 63.207071\n", + " no\n", + " \n", + " \n", + " 1\n", + " Swansea\n", + " 230300\n", + " 1680\n", + " 0.007295\n", + " 137.083333\n", + " no\n", + " \n", + " \n", + " 0\n", + " Cardiff\n", + " 335145\n", + " 0\n", + " 0.006266\n", + " 159.592857\n", + " no\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " name population n_pubs pubs_per_capita people_per_pub to_visit\n", + "3 Machynlleth 2235 48 0.021477 
46.562500 yes!\n", + "2 Abergavenny 12515 198 0.015821 63.207071 no\n", + "1 Swansea 230300 1680 0.007295 137.083333 no\n", + "0 Cardiff 335145 0 0.006266 159.592857 no" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can also use this to set values:\n", + "towns_df.loc[towns_df['name'] == 'Cardiff', 'n_pubs'] = 0\n", + "towns_df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "# .iloc selects rows by integer position (0 is the first row), regardless of the index labels shown in the leftmost column.\n", + "# It only accepts integer positions, unlike .loc, which selects by label" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "name Cardiff\n", + "population 335145\n", + "n_pubs 0\n", + "pubs_per_capita 0.00626594\n", + "people_per_pub 159.593\n", + "to_visit no\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df.iloc[3]" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "# you can change this index column by setting a new one:" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationn_pubspubs_per_capitapeople_per_pubto_visit
name
Machynlleth2235480.02147746.562500yes!
Abergavenny125151980.01582163.207071no
Swansea23030016800.007295137.083333no
Cardiff33514500.006266159.592857no
\n", + "
" + ], + "text/plain": [ + " population n_pubs pubs_per_capita people_per_pub to_visit\n", + "name \n", + "Machynlleth 2235 48 0.021477 46.562500 yes!\n", + "Abergavenny 12515 198 0.015821 63.207071 no\n", + "Swansea 230300 1680 0.007295 137.083333 no\n", + "Cardiff 335145 0 0.006266 159.592857 no" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df = towns_df.set_index('name')\n", + "towns_df" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "ename": "TypeError", + "evalue": "Cannot index by location index with a non-integer key", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mtowns_df\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0miloc\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'Cardiff'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m__getitem__\u001b[1;34m(self, key)\u001b[0m\n\u001b[0;32m 1498\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1499\u001b[0m \u001b[0mmaybe_callable\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mcom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mapply_if_callable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mobj\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1500\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_getitem_axis\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmaybe_callable\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0maxis\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0maxis\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1501\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1502\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_is_scalar_access\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\core\\indexing.py\u001b[0m in \u001b[0;36m_getitem_axis\u001b[1;34m(self, key, axis)\u001b[0m\n\u001b[0;32m 2224\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2225\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mis_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2226\u001b[1;33m raise TypeError(\"Cannot index by location index with a \"\n\u001b[0m\u001b[0;32m 2227\u001b[0m \"non-integer key\")\n\u001b[0;32m 2228\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mTypeError\u001b[0m: Cannot index by location index with a non-integer key" + ] + } + ], + "source": [ + "# this will fail because .iloc only accepts integer positions, not index labels such as 'Cardiff'\n", + "towns_df.iloc['Cardiff']" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "335145" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# reformatting the towns_df DataFrame to make 'name' the index allows me to make calls like this using loc:\n", + "towns_df.loc['Cardiff','population']" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "# indexes can be reset at any time:\n", + "towns_df = 
towns_df.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "# You can also select multiple columns at a time:" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulation
0Machynlleth2235
1Abergavenny12515
2Swansea230300
3Cardiff335145
\n", + "
" + ], + "text/plain": [ + " name population\n", + "0 Machynlleth 2235\n", + "1 Abergavenny 12515\n", + "2 Swansea 230300\n", + "3 Cardiff 335145" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "towns_df[['name','population']]" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "RangeIndex(start=0, stop=4, step=1)" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# you can see the index by calling it directly:\n", + "towns_df.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 1.4 Pandas II" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "# You can join pandas dataframes together in many ways" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "df_A = pd.DataFrame({\n", + " 'name':towns,\n", + " 'population':populations\n", + "})\n", + "\n", + "df_B = pd.DataFrame({\n", + " 'name':towns,\n", + " 'pubs':number_of_pubs\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationnamepubs
0Cardiff335145Cardiff2100
1Swansea230300Swansea1680
2Abergavenny12515Abergavenny198
3Machynlleth2235Machynlleth48
\n", + "
" + ], + "text/plain": [ + " name population name pubs\n", + "0 Cardiff 335145 Cardiff 2100\n", + "1 Swansea 230300 Swansea 1680\n", + "2 Abergavenny 12515 Abergavenny 198\n", + "3 Machynlleth 2235 Machynlleth 48" + ] + }, + "execution_count": 137, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df_A, df_B], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\charl\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulationpubs
0Cardiff335145.0NaN
1Swansea230300.0NaN
2Abergavenny12515.0NaN
3Machynlleth2235.0NaN
0CardiffNaN12/20/20182100.0
1SwanseaNaN1680.0
2AbergavennyNaN198.0
3MachynllethNaN48.0
\n", + "
" + ], + "text/plain": [ + " name population pubs\n", + "0 Cardiff 335145.0 NaN\n", + "1 Swansea 230300.0 NaN\n", + "2 Abergavenny 12515.0 NaN\n", + "3 Machynlleth 2235.0 NaN\n", + "0 Cardiff NaN 2100.0\n", + "1 Swansea NaN 1680.0\n", + "2 Abergavenny NaN 198.0\n", + "3 Machynlleth NaN 48.0" + ] + }, + "execution_count": 138, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df_A, df_B], axis = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [], + "source": [ + "# These don't look correct! (N.B. - but they would be if each DF contained the same columns). \n", + "# The way to get around this is to assign a common index that both frames share" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "df_A = df_A.set_index('name')" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "df_B = df_B.set_index('name')" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
population
name
Cardiff335145
Swansea230300
Abergavenny12515
Machynlleth2235
\n", + "
" + ], + "text/plain": [ + " population\n", + "name \n", + "Cardiff 335145\n", + "Swansea 230300\n", + "Abergavenny 12515\n", + "Machynlleth 2235" + ] + }, + "execution_count": 142, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_A" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pubs
name
Cardiff2100
Swansea1680
Abergavenny198
Machynlleth48
\n", + "
" + ], + "text/plain": [ + " pubs\n", + "name \n", + "Cardiff 2100\n", + "Swansea 1680\n", + "Abergavenny 198\n", + "Machynlleth 48" + ] + }, + "execution_count": 143, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_B" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "df_A['pubs'] = df_B['pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
name
Cardiff3351452100
Swansea2303001680
Abergavenny12515198
Machynlleth223548
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "name \n", + "Cardiff 335145 2100\n", + "Swansea 230300 1680\n", + "Abergavenny 12515 198\n", + "Machynlleth 2235 48" + ] + }, + "execution_count": 145, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_A" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
03351452100
12303001680
212515198
3223548
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "0 335145 2100\n", + "1 230300 1680\n", + "2 12515 198\n", + "3 2235 48" + ] + }, + "execution_count": 146, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# You can also do this same operation via '.merge', a method of DataFrames, should they have the same index\n", + "df_A.merge(df_B, how = 'inner')" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "# What if these dataFrames aren't the same size?" + ] + }, + { + "cell_type": "code", + "execution_count": 167, + "metadata": {}, + "outputs": [], + "source": [ + "df_C = pd.DataFrame({\n", + " 'name':towns,\n", + " 'population':populations\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 168, + "metadata": {}, + "outputs": [], + "source": [ + "df_D = pd.DataFrame({'name':'Winchester','population':40005}, index = [4])" + ] + }, + { + "cell_type": "code", + "execution_count": 169, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulation
4Winchester40005
\n", + "
" + ], + "text/plain": [ + " name population\n", + "4 Winchester 40005" + ] + }, + "execution_count": 169, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_D" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "df_C = df_C.append(df_D)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namepopulation
0Cardiff335145
1Swansea230300
2Abergavenny12515
3Machynlleth2235
4Winchester40005
\n", + "
" + ], + "text/plain": [ + " name population\n", + "0 Cardiff 335145\n", + "1 Swansea 230300\n", + "2 Abergavenny 12515\n", + "3 Machynlleth 2235\n", + "4 Winchester 40005" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [], + "source": [ + "df_C = df_C.set_index('name')" + ] + }, + { + "cell_type": "code", + "execution_count": 173, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
population
name
Cardiff335145
Swansea230300
Abergavenny12515
Machynlleth2235
Winchester40005
\n", + "
" + ], + "text/plain": [ + " population\n", + "name \n", + "Cardiff 335145\n", + "Swansea 230300\n", + "Abergavenny 12515\n", + "Machynlleth 2235\n", + "Winchester 40005" + ] + }, + "execution_count": 173, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "code", + "execution_count": 174, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
pubs
216000QN11551.0110.0410.088.001009.027.032.0NaN...NaNNaN12/6/201812/20/201812/21/201812/20/201812/21/2018NaNNaN12/20/2018name
324000BK6094.014.0310.0152.002002.020.043.011228.0...NaNNaN12/6/201812/20/201812/21/201812/20/201812/21/2018NaNNaN12/20/2018Cardiff2100
432000QN7791.02.0411.01291.021001.026.023.011364.0...NaNNaN12/6/201812/20/201812/21/201812/20/201812/21/2018NaNNaN12/20/2018Swansea1680
Abergavenny198
Machynlleth48
\n", - "

5 rows × 97 columns

\n", "
" ], "text/plain": [ - " Unnamed: 0 borough block lot cd ct2010 cb2010 schooldist \\\n", - "0 0 BX 5641.0 670.0 210.0 516.00 2.0 NaN \n", - "1 8000 SI 7864.0 67.0 503.0 244.02 1012.0 31.0 \n", - "2 16000 QN 11551.0 110.0 410.0 88.00 1009.0 27.0 \n", - "3 24000 BK 6094.0 14.0 310.0 152.00 2002.0 20.0 \n", - "4 32000 QN 7791.0 2.0 411.0 1291.02 1001.0 26.0 \n", + " pubs\n", + "name \n", + "Cardiff 2100\n", + "Swansea 1680\n", + "Abergavenny 198\n", + "Machynlleth 48" + ] + }, + "execution_count": 174, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_B" + ] + }, + { + "cell_type": "code", + "execution_count": 175, + "metadata": {}, + "outputs": [], + "source": [ + "df_C['pubs'] = df_B['pubs']" + ] + }, + { + "cell_type": "code", + "execution_count": 176, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
name
Cardiff3351452100.0
Swansea2303001680.0
Abergavenny12515198.0
Machynlleth223548.0
Winchester40005NaN
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "name \n", + "Cardiff 335145 2100.0\n", + "Swansea 230300 1680.0\n", + "Abergavenny 12515 198.0\n", + "Machynlleth 2235 48.0\n", + "Winchester 40005 NaN" + ] + }, + "execution_count": 176, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "nan" + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_C['pubs'].loc['Winchester']" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "numpy.float64" + ] + }, + "execution_count": 178, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df_C['pubs'].loc['Winchester'])" + ] + }, + { + "cell_type": "code", + "execution_count": 180, + "metadata": {}, + "outputs": [], + "source": [ + "df_C['pubs'] = df_C['pubs'].fillna(850)" + ] + }, + { + "cell_type": "code", + "execution_count": 181, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
populationpubs
name
Cardiff3351452100.0
Swansea2303001680.0
Abergavenny12515198.0
Machynlleth223548.0
Winchester40005850.0
\n", + "
" + ], + "text/plain": [ + " population pubs\n", + "name \n", + "Cardiff 335145 2100.0\n", + "Swansea 230300 1680.0\n", + "Abergavenny 12515 198.0\n", + "Machynlleth 2235 48.0\n", + "Winchester 40005 850.0" ] }, - "execution_count": 160, + "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "df_C" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.1 Read data from files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a dataframe using read_csv()\n", + "# Here, we would use os.path.join() to \n", + "df = pd.read_csv('pluto_18v2_1.csv')\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = df[::20]\n", + "df.shape\n", + "df.to_csv('pluto_shortened.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('pluto_shortened.csv')\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df.head()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 2.2 Save data back to files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(os.path.join( [ your file location here!! ]) )" + ] } ], "metadata": {