Working with Pandas¶
The previous course notes covered some pandas basics. These course notes will cover some other functionalities of pandas that we'll use throughout the course.
# Importing pandas
import pandas as pd
More Ways to Create Pandas DataFrames¶
Pandas DataFrames can be created from both dictionaries and lists of lists. With both data structures, you just have to make sure the lists all have the same length: the dictionary's values must be lists of equal length, and so must the inner lists.
# Creates a dictionary of places
place_dictionary = {'city': ['Seattle', 'Portland', 'Corvallis', 'San Francisco'],
'state': ['Washington', 'Oregon', 'Oregon', 'California'],
'population': [725000, 650000, 60000, 815000]}
# Turns the dictionary into a pandas DataFrame
place_df = pd.DataFrame(place_dictionary)
place_df
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | Washington | 725000 |
| 1 | Portland | Oregon | 650000 |
| 2 | Corvallis | Oregon | 60000 |
| 3 | San Francisco | California | 815000 |
# Creates a list of lists of places
place_list = [['Seattle', 'Washington', 725000],
['Portland', 'Oregon', 650000],
['Corvallis', 'Oregon', 60000],
['San Francisco', 'California', 815000]]
# Turns the list of lists into a pandas DataFrame
place_df = pd.DataFrame(place_list, columns = ['city', 'state', 'population'])
place_df
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | Washington | 725000 |
| 1 | Portland | Oregon | 650000 |
| 2 | Corvallis | Oregon | 60000 |
| 3 | San Francisco | California | 815000 |
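If the lengths don't match, pandas refuses to build the DataFrame. A quick sketch of the error you'd see (the dictionary below is made up for illustration):

```python
import pandas as pd

# 'state' has fewer entries than 'city', so construction fails
bad_dictionary = {'city': ['Seattle', 'Portland'],
                  'state': ['Washington']}

try:
    pd.DataFrame(bad_dictionary)
except ValueError as error:
    print(error)  # all arrays must be of the same length
```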
Accessing Values from DataFrames¶
Values can be accessed from a DataFrame using loc and iloc.
loc allows you to access values based on their labels. The first argument is the index label of the row you want the data from, while the second argument is the label of the column you want the data from. Instead of using parentheses, loc uses square brackets.
# Accesses the value in the row labeled 1 of the state column
place_df.loc[1, 'state']
'Oregon'
# Accesses all the values of the population column
place_df.loc[:, 'population']
0    725000
1    650000
2     60000
3    815000
Name: population, dtype: int64
# Accesses all the values of both the city and population columns and creates a DataFrame with the data
population_df = place_df.loc[:, ['city', 'population']]
population_df
| | city | population |
|---|---|---|
| 0 | Seattle | 725000 |
| 1 | Portland | 650000 |
| 2 | Corvallis | 60000 |
| 3 | San Francisco | 815000 |
iloc stands for integer location. Instead of labels, iloc uses integer positions for both rows and columns. For our DataFrame, the positions of the city, state, and population columns are 0, 1, and 2, respectively. The first argument of iloc is the row position and the second argument is the column position. Like loc, iloc uses square brackets.
# Accesses the value at row 0 and column 0 (city)
place_df.iloc[0,0]
'Seattle'
# Accesses the values at rows 0 and 1 (the slice 0:2 excludes 2) of column 2 (population)
place_df.iloc[0:2,2]
0    725000
1    650000
Name: population, dtype: int64
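One subtlety worth noting: with a non-default index, loc and iloc can point at different rows, and their slices behave differently too. A small sketch (the DataFrame below is made up for illustration):

```python
import pandas as pd

df = pd.DataFrame({'value': [10, 20, 30]}, index=[100, 101, 102])

print(df.loc[100, 'value'])    # label-based: the row labeled 100 -> 10
print(df.iloc[0, 0])           # position-based: the first row -> 10

# Slices also differ: loc includes the endpoint, iloc excludes it
print(len(df.loc[100:101]))    # 2 rows (labels 100 and 101)
print(len(df.iloc[0:1]))       # 1 row
```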
Replacing Values in DataFrames¶
If values in our DataFrame are incorrect, or if we simply want to update them, we can replace them using the replace() function. The first argument of the function is the current value in the DataFrame that we want to change, and the second argument is the value we want to replace it with.
# Replaces the city of Corvallis with Beaverton
place_df.replace('Corvallis', 'Beaverton')
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | Washington | 725000 |
| 1 | Portland | Oregon | 650000 |
| 2 | Beaverton | Oregon | 60000 |
| 3 | San Francisco | California | 815000 |
However, simply using replace() doesn't alter the original DataFrame. We can see this when we print the DataFrame.
place_df
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | Washington | 725000 |
| 1 | Portland | Oregon | 650000 |
| 2 | Corvallis | Oregon | 60000 |
| 3 | San Francisco | California | 815000 |
Corvallis is still in the DataFrame!
Values can be permanently replaced in the DataFrame using the inplace = True argument.
# Permanently replaces Corvallis with Beaverton
place_df.replace('Corvallis', 'Beaverton', inplace = True)
place_df
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | Washington | 725000 |
| 1 | Portland | Oregon | 650000 |
| 2 | Beaverton | Oregon | 60000 |
| 3 | San Francisco | California | 815000 |
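As an aside, assigning the result of replace() back to the variable has the same effect as inplace = True and is a common alternative. A minimal sketch using the places data:

```python
import pandas as pd

place_df = pd.DataFrame({'city': ['Seattle', 'Portland', 'Corvallis', 'San Francisco'],
                         'state': ['Washington', 'Oregon', 'Oregon', 'California'],
                         'population': [725000, 650000, 60000, 815000]})

# Reassignment keeps the replacement, just like inplace = True
place_df = place_df.replace('Corvallis', 'Beaverton')
print(place_df['city'].tolist())  # ['Seattle', 'Portland', 'Beaverton', 'San Francisco']
```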
Multiple values can be replaced in the DataFrame by using a dictionary of replacement values, with the keys being the old values and the values being the new values.
# Creates a replacement dictionary
replacement_dict = {'Washington': 'WA', 'Oregon': 'OR', 'California': 'CA'}
# Replaces values in the DataFrame with the replacement dictionary
place_df.replace(replacement_dict, inplace = True)
place_df
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | WA | 725000 |
| 1 | Portland | OR | 650000 |
| 2 | Beaverton | OR | 60000 |
| 3 | San Francisco | CA | 815000 |
For more precise replacement, you can use a nested dictionary whose outer keys are column names; values are then only replaced within those columns.
# Creates a nested replacement dictionary
replacement_dict = {'state': {'Washington': 'WA', 'Oregon': 'OR', 'California': 'CA'}}
# Replaces values in the DataFrame with the replacement dictionary
place_df.replace(replacement_dict, inplace= True)
place_df
| | city | state | population |
|---|---|---|---|
| 0 | Seattle | WA | 725000 |
| 1 | Portland | OR | 650000 |
| 2 | Beaverton | OR | 60000 |
| 3 | San Francisco | CA | 815000 |
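To see why the nested form matters, consider a DataFrame where the same value appears in more than one column; the nested dictionary only touches the named column. (The DataFrame below is made up for illustration.)

```python
import pandas as pd

trip_df = pd.DataFrame({'birth_state': ['Oregon', 'California'],
                        'state': ['Oregon', 'Oregon']})

# Only the 'state' column is affected; 'birth_state' keeps 'Oregon'
trip_df = trip_df.replace({'state': {'Oregon': 'OR'}})
print(trip_df)
```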
Grouping Data in DataFrames¶
It can be helpful to group data, such as by year, by species, or by other categories, in order to better analyze and understand it.
We'll work with the data from the surveys.csv file, which contains records of various species collected between 1977 and 2002. Each record includes a record id; the month, day, and year the specimen was recorded; a plot id and species id; and the specimen's sex, hindfoot length, and weight.
# Reads in the species data
species_df = pd.read_csv("surveys.csv")
species_df
| | record_id | month | day | year | plot_id | species_id | sex | hindfoot_length | weight |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 7 | 16 | 1977 | 2 | NL | M | 32.0 | NaN |
| 1 | 2 | 7 | 16 | 1977 | 3 | NL | M | 33.0 | NaN |
| 2 | 3 | 7 | 16 | 1977 | 2 | DM | F | 37.0 | NaN |
| 3 | 4 | 7 | 16 | 1977 | 7 | DM | M | 36.0 | NaN |
| 4 | 5 | 7 | 16 | 1977 | 3 | DM | M | 35.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 35544 | 35545 | 12 | 31 | 2002 | 15 | AH | NaN | NaN | NaN |
| 35545 | 35546 | 12 | 31 | 2002 | 15 | AH | NaN | NaN | NaN |
| 35546 | 35547 | 12 | 31 | 2002 | 10 | RM | F | 15.0 | 14.0 |
| 35547 | 35548 | 12 | 31 | 2002 | 7 | DO | M | 36.0 | 51.0 |
| 35548 | 35549 | 12 | 31 | 2002 | 5 | NaN | NaN | NaN | NaN |
35549 rows × 9 columns
Data can be grouped using the groupby() function. When we use groupby(), we typically follow it with a function that computes something for each group, such as taking the mean, summing the values, or counting the number of entries.
# Groups the DataFrame by year and takes the mean
species_df.groupby('year').mean(numeric_only = True)
| year | record_id | month | day | plot_id | hindfoot_length | weight |
|---|---|---|---|---|---|---|
| 1977 | 252.0 | 9.568588 | 14.986083 | 11.616302 | 36.276961 | 46.650376 |
| 1978 | 1027.5 | 6.560115 | 8.339695 | 10.822519 | 37.278880 | 67.911290 |
| 1979 | 1911.0 | 6.589708 | 23.025035 | 12.029207 | 35.167219 | 63.390282 |
| 1980 | 2978.0 | 5.942049 | 15.309541 | 11.649470 | 35.082035 | 62.448747 |
| 1981 | 4421.5 | 5.790082 | 13.550272 | 11.864130 | 35.538830 | 65.843888 |
| 1982 | 6146.5 | 6.280081 | 23.142568 | 11.943883 | 31.710897 | 53.765888 |
| 1983 | 7972.0 | 6.148834 | 15.041841 | 11.901375 | 32.390635 | 55.102828 |
| 1984 | 9299.0 | 6.486239 | 17.147808 | 11.697248 | 31.929385 | 50.955682 |
| 1985 | 10508.5 | 6.429764 | 17.654381 | 10.566064 | 32.617844 | 46.694570 |
| 1986 | 11698.5 | 7.366242 | 9.264331 | 11.032909 | 33.057444 | 55.053927 |
| 1987 | 13005.0 | 6.470975 | 19.435069 | 11.404548 | 29.553377 | 49.443639 |
| 1988 | 14575.0 | 6.107556 | 15.264806 | 11.089857 | 28.190691 | 45.055765 |
| 1989 | 16094.0 | 5.940089 | 7.652645 | 11.400255 | 25.792094 | 35.754691 |
| 1990 | 17534.0 | 5.699466 | 20.770404 | 10.594966 | 28.408658 | 35.483044 |
| 1991 | 18863.0 | 6.792873 | 12.401633 | 10.647365 | 25.953202 | 32.040033 |
| 1992 | 20055.5 | 5.748555 | 15.296724 | 10.765896 | 26.859956 | 33.328160 |
| 1993 | 20949.5 | 6.362667 | 18.480000 | 12.153333 | 27.285935 | 34.271028 |
| 1994 | 21658.5 | 6.652695 | 10.502994 | 11.586826 | 29.757194 | 34.546595 |
| 1995 | 22603.5 | 7.557283 | 18.128478 | 10.960720 | 26.915146 | 29.530387 |
| 1996 | 24067.5 | 6.225088 | 19.032825 | 11.059789 | 26.328422 | 28.207174 |
| 1997 | 26167.0 | 6.328520 | 16.446049 | 11.681107 | 26.177299 | 31.761885 |
| 1998 | 28218.5 | 6.845963 | 19.727329 | 11.909317 | 28.202515 | 34.826523 |
| 1999 | 29591.0 | 5.661674 | 13.646696 | 11.540969 | 28.322160 | 36.503256 |
| 2000 | 30934.5 | 6.154639 | 15.262887 | 11.091495 | 26.130375 | 32.394591 |
| 2001 | 32515.5 | 7.576398 | 20.282609 | 11.676398 | 27.140363 | 36.469529 |
| 2002 | 34435.0 | 7.084343 | 13.198744 | 11.487663 | 27.242353 | 35.641555 |
A single column can be isolated when using groupby() by including the column name in square brackets after the groupby() function.
# Isolates the weight column
species_df.groupby('year')['weight'].mean()
year
1977    46.650376
1978    67.911290
1979    63.390282
1980    62.448747
1981    65.843888
1982    53.765888
1983    55.102828
1984    50.955682
1985    46.694570
1986    55.053927
1987    49.443639
1988    45.055765
1989    35.754691
1990    35.483044
1991    32.040033
1992    33.328160
1993    34.271028
1994    34.546595
1995    29.530387
1996    28.207174
1997    31.761885
1998    34.826523
1999    36.503256
2000    32.394591
2001    36.469529
2002    35.641555
Name: weight, dtype: float64
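If you don't have surveys.csv at hand, the same pattern can be reproduced on a small, made-up DataFrame:

```python
import pandas as pd

toy_df = pd.DataFrame({'year': [1977, 1977, 1978],
                       'weight': [10.0, 20.0, 30.0]})

# Mean weight per year: 1977 -> 15.0, 1978 -> 30.0
print(toy_df.groupby('year')['weight'].mean())
```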
# Groups by year and counts the number of entries that aren't NaN
species_df.groupby('year').count()
| year | record_id | month | day | plot_id | species_id | sex | hindfoot_length | weight |
|---|---|---|---|---|---|---|---|---|
| 1977 | 503 | 503 | 503 | 503 | 487 | 418 | 408 | 266 |
| 1978 | 1048 | 1048 | 1048 | 1048 | 992 | 936 | 857 | 868 |
| 1979 | 719 | 719 | 719 | 719 | 658 | 651 | 604 | 638 |
| 1980 | 1415 | 1415 | 1415 | 1415 | 1375 | 1332 | 963 | 1317 |
| 1981 | 1472 | 1472 | 1472 | 1472 | 1417 | 1376 | 1043 | 1358 |
| 1982 | 1978 | 1978 | 1978 | 1978 | 1964 | 1850 | 1560 | 1841 |
| 1983 | 1673 | 1673 | 1673 | 1673 | 1652 | 1568 | 1559 | 1556 |
| 1984 | 981 | 981 | 981 | 981 | 951 | 888 | 878 | 880 |
| 1985 | 1438 | 1438 | 1438 | 1438 | 1416 | 1352 | 1345 | 1326 |
| 1986 | 942 | 942 | 942 | 942 | 922 | 869 | 853 | 853 |
| 1987 | 1671 | 1671 | 1671 | 1671 | 1668 | 1539 | 1377 | 1517 |
| 1988 | 1469 | 1469 | 1469 | 1469 | 1457 | 1341 | 1332 | 1327 |
| 1989 | 1569 | 1569 | 1569 | 1569 | 1562 | 1458 | 1366 | 1439 |
| 1990 | 1311 | 1311 | 1311 | 1311 | 1296 | 1217 | 1155 | 1209 |
| 1991 | 1347 | 1347 | 1347 | 1347 | 1332 | 1243 | 1218 | 1224 |
| 1992 | 1038 | 1038 | 1038 | 1038 | 1008 | 920 | 914 | 902 |
| 1993 | 750 | 750 | 750 | 750 | 714 | 665 | 647 | 642 |
| 1994 | 668 | 668 | 668 | 668 | 615 | 570 | 556 | 558 |
| 1995 | 1222 | 1222 | 1222 | 1222 | 1180 | 1108 | 1096 | 1086 |
| 1996 | 1706 | 1706 | 1706 | 1706 | 1689 | 1635 | 1629 | 1617 |
| 1997 | 2493 | 2493 | 2493 | 2493 | 2492 | 2428 | 2414 | 2419 |
| 1998 | 1610 | 1610 | 1610 | 1610 | 1590 | 1515 | 1511 | 1395 |
| 1999 | 1135 | 1135 | 1135 | 1135 | 1108 | 1075 | 1074 | 1075 |
| 2000 | 1552 | 1552 | 1552 | 1552 | 1509 | 1469 | 1465 | 1442 |
| 2001 | 1610 | 1610 | 1610 | 1610 | 1540 | 1487 | 1489 | 1444 |
| 2002 | 2229 | 2229 | 2229 | 2229 | 2192 | 2128 | 2125 | 2084 |
So far, we've only computed one statistic at a time with groupby(). The agg() function, however, allows us to perform different operations on different columns. In the agg() function, you specify the column and the statistic you'd like to calculate with the syntax ('column name', 'statistical operation'), and assign each pair to an output column name. Each pair becomes a summary column in the generated DataFrame.
# Groups by year and makes summary columns using agg
species_df.groupby('year').agg(mean_hindfoot = ('hindfoot_length', 'mean'), max_weight = ('weight', 'max'))
| year | mean_hindfoot | max_weight |
|---|---|---|
| 1977 | 36.276961 | 149.0 |
| 1978 | 37.278880 | 232.0 |
| 1979 | 35.167219 | 274.0 |
| 1980 | 35.082035 | 243.0 |
| 1981 | 35.538830 | 264.0 |
| 1982 | 31.710897 | 252.0 |
| 1983 | 32.390635 | 256.0 |
| 1984 | 31.929385 | 259.0 |
| 1985 | 32.617844 | 225.0 |
| 1986 | 33.057444 | 240.0 |
| 1987 | 29.553377 | 278.0 |
| 1988 | 28.190691 | 248.0 |
| 1989 | 25.792094 | 275.0 |
| 1990 | 28.408658 | 243.0 |
| 1991 | 25.953202 | 199.0 |
| 1992 | 26.859956 | 220.0 |
| 1993 | 27.285935 | 233.0 |
| 1994 | 29.757194 | 226.0 |
| 1995 | 26.915146 | 171.0 |
| 1996 | 26.328422 | 185.0 |
| 1997 | 26.177299 | 231.0 |
| 1998 | 28.202515 | 238.0 |
| 1999 | 28.322160 | 227.0 |
| 2000 | 26.130375 | 265.0 |
| 2001 | 27.140363 | 280.0 |
| 2002 | 27.242353 | 248.0 |
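The same named-aggregation syntax works on any DataFrame; here is a self-contained sketch with made-up numbers:

```python
import pandas as pd

toy_df = pd.DataFrame({'year': [1977, 1977, 1978],
                       'hindfoot_length': [30.0, 34.0, 28.0],
                       'weight': [10.0, 20.0, 30.0]})

# One summary column per (input column, operation) pair
summary = toy_df.groupby('year').agg(mean_hindfoot = ('hindfoot_length', 'mean'),
                                     max_weight = ('weight', 'max'))
print(summary)
```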
Changing the Indexing of DataFrames¶
When we run certain operations on DataFrames, the output may be indexed by a data column, such as the year, instead of the default integer index. Below is an example:
# Example of a Series indexed by year
hindfoot_df = species_df.groupby('year')['hindfoot_length'].mean()
hindfoot_df
year
1977    36.276961
1978    37.278880
1979    35.167219
1980    35.082035
1981    35.538830
1982    31.710897
1983    32.390635
1984    31.929385
1985    32.617844
1986    33.057444
1987    29.553377
1988    28.190691
1989    25.792094
1990    28.408658
1991    25.953202
1992    26.859956
1993    27.285935
1994    29.757194
1995    26.915146
1996    26.328422
1997    26.177299
1998    28.202515
1999    28.322160
2000    26.130375
2001    27.140363
2002    27.242353
Name: hindfoot_length, dtype: float64
If we want the row labels to be integers, rather than values like the year, we can use reset_index(). This turns the current index into a column and creates a fresh integer index for the rows (for a Series, the result is a DataFrame).
# Resets the indices of the DataFrame
hindfoot_df.reset_index()
| | year | hindfoot_length |
|---|---|---|
| 0 | 1977 | 36.276961 |
| 1 | 1978 | 37.278880 |
| 2 | 1979 | 35.167219 |
| 3 | 1980 | 35.082035 |
| 4 | 1981 | 35.538830 |
| 5 | 1982 | 31.710897 |
| 6 | 1983 | 32.390635 |
| 7 | 1984 | 31.929385 |
| 8 | 1985 | 32.617844 |
| 9 | 1986 | 33.057444 |
| 10 | 1987 | 29.553377 |
| 11 | 1988 | 28.190691 |
| 12 | 1989 | 25.792094 |
| 13 | 1990 | 28.408658 |
| 14 | 1991 | 25.953202 |
| 15 | 1992 | 26.859956 |
| 16 | 1993 | 27.285935 |
| 17 | 1994 | 29.757194 |
| 18 | 1995 | 26.915146 |
| 19 | 1996 | 26.328422 |
| 20 | 1997 | 26.177299 |
| 21 | 1998 | 28.202515 |
| 22 | 1999 | 28.322160 |
| 23 | 2000 | 26.130375 |
| 24 | 2001 | 27.140363 |
| 25 | 2002 | 27.242353 |
If you don't want to keep the current index as a column when resetting, include the drop = True argument.
# Resets the indices of the DataFrame and drops the year
hindfoot_df.reset_index(drop = True)
0     36.276961
1     37.278880
2     35.167219
3     35.082035
4     35.538830
5     31.710897
6     32.390635
7     31.929385
8     32.617844
9     33.057444
10    29.553377
11    28.190691
12    25.792094
13    28.408658
14    25.953202
15    26.859956
16    27.285935
17    29.757194
18    26.915146
19    26.328422
20    26.177299
21    28.202515
22    28.322160
23    26.130375
24    27.140363
25    27.242353
Name: hindfoot_length, dtype: float64
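A compact, self-contained sketch of both behaviors (the Series below is made up for illustration):

```python
import pandas as pd

s = pd.Series([36.3, 37.3], index=pd.Index([1977, 1978], name='year'),
              name='hindfoot_length')

# Keeping the index turns it into a column of the new DataFrame
print(s.reset_index().columns.tolist())         # ['year', 'hindfoot_length']

# Dropping it leaves a Series with a fresh integer index
print(s.reset_index(drop=True).index.tolist())  # [0, 1]
```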