Mary,F,7065
Anna,F,2604
Emma,F,2003
Elizabeth,F,1939
Minnie,F,1746
Margaret,F,1578
Ida,F,1472
Alice,F,1414
Bertha,F,1320
。。。
In [11]: cd "d:\\Desktop"
d:\Desktop
In [12]: pwd
Out[12]: u'D:\\Desktop'
In [13]: names1880 = pd.read_csv('names/yob1880.txt', names= ['name', 'sex', 'births'])
In [14]: names1880
Out[14]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Data columns (total 3 columns):
name 2000 non-null values
sex 2000 non-null values
births 2000 non-null values
dtypes: int64(1), object(2)
In [16]: names1880.groupby('sex').births.sum()
Out[16]:
sex
F 90993
M 110493
Name: births, dtype: int64
# Load the yearly name files for 1880-2010 into one DataFrame.
years = range(1880, 2011)
columns = ['name', 'sex', 'births']
pieces = []
for year in years:
    path = 'names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    # Only name, sex and births are read from each file, so tag every row
    # with the year its file belongs to.
    frame['year'] = year
    pieces.append(frame)
# Combine the per-year frames into a single DataFrame; ignore_index=True
# discards the row numbers that read_csv assigned within each file.
names = pd.concat(pieces, ignore_index=True)
In [36]: names
Out[36]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1690784 entries, 0 to 1690783
Data columns (total 4 columns):
name 1690784 non-null values
sex 1690784 non-null values
births 1690784 non-null values
year 1690784 non-null values
dtypes: int64(2), object(2)
# Total births per year, split by sex.
# pivot_table takes index=/columns=; the old rows=/cols= keywords were removed.
total_births = names.pivot_table('births', index='year', columns='sex',
                                 aggfunc='sum')
total_births.tail()
# Plot the totals to compare boys and girls over time.
total_births.plot(title='Total births by sex and year')
# Add a prop column: each row's births as a fraction of the total births in
# its group (the caller groups by year and sex).
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The frame passed in is left unmodified, because
    mutating the object handed over by ``groupby.apply`` is unsafe.
    """
    # Cast to float so the division can never be an integer division.
    births = group.births.astype(float)
    result = group.copy()
    result['prop'] = births / births.sum()
    return result
# group_keys=False keeps the original row index instead of prepending the
# (year, sex) keys, which would clash with the columns of the same name.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
#这里供大家偷懒,我写了,大家可以粘贴过去运行, **后面就不赘述了。。 **
def get_top1k(group):
    """Return up to 1000 rows of *group* with the most births, largest first."""
    # sort_values replaces the removed sort_index(by=...) spelling.
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1k = grouped.apply(get_top1k)
# year and sex are already columns, so drop the group index that apply adds;
# keeping it makes later groupby/pivot_table calls on 'year' ambiguous.
top1k = top1k.reset_index(drop=True)
#粘贴过去运行是这个样子的:
In [37]: def get_top1k(group):
....: return group.sort_index(by = 'births', ascending = False)[:1000]
....:
In [38]: grouped = names.groupby(['year', 'sex'])
In [39]: top1k = grouped.apply(get_top1k)
In [40]: top1k
Out[40]:
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 261877 entries, (1880, F, 0) to (2010, M, 1677643)
Data columns (total 4 columns):
name 261877 non-null values
sex 261877 non-null values
births 261877 non-null values
year 261877 non-null values
dtypes: int64(2), object(2)
# Split the top-1000 rows into boys and girls.
boys = top1k[top1k.sex == 'M']
girls = top1k[top1k.sex == 'F']
# Total births per year for every name.
# pivot_table takes index=/columns=; the old rows=/cols= keywords were removed.
total_births = top1k.pivot_table('births', index='year', columns='name',
                                 aggfunc='sum')
# Plot the yearly trend for a handful of names, one subplot per name.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title='Number of births per year')
# Sum of prop over the top-1000 rows, per year and sex.
table = top1k.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of table1k.prop by year & sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))
* 啊呀,果然多样性越来越厉害了挖。。。。特别是女生,大家都懂的。 :-D
import pandas as pd;import numpy as np
# Load the yearly name files for 1880-2010 into one DataFrame.
years = range(1880, 2011)
columns = ['name', 'sex', 'births']
pieces = []
for year in years:
    path = 'names/yob%d.txt' % year
    frame = pd.read_csv(path, names=columns)
    # Only name, sex and births are read from each file, so tag every row
    # with the year its file belongs to.
    frame['year'] = year
    pieces.append(frame)
# Combine the per-year frames into a single DataFrame; ignore_index=True
# discards the row numbers that read_csv assigned within each file.
names = pd.concat(pieces, ignore_index=True)
# Total births per year, split by sex.
# pivot_table takes index=/columns=; the old rows=/cols= keywords were removed.
total_births = names.pivot_table('births', index='year', columns='sex',
                                 aggfunc='sum')
total_births.tail()
# Plot the totals to compare boys and girls over time.
total_births.plot(title='Total births by sex and year')
# Add a prop column: each row's births as a fraction of the total births in
# its group (the caller groups by year and sex).
def add_prop(group):
    """Return a copy of *group* with a ``prop`` column added.

    ``prop`` is each row's ``births`` divided by the sum of ``births`` over
    the whole group. The frame passed in is left unmodified, because
    mutating the object handed over by ``groupby.apply`` is unsafe.
    """
    # Cast to float so the division can never be an integer division.
    births = group.births.astype(float)
    result = group.copy()
    result['prop'] = births / births.sum()
    return result
# group_keys=False keeps the original row index instead of prepending the
# (year, sex) keys, which would clash with the columns of the same name.
names = names.groupby(['year', 'sex'], group_keys=False).apply(add_prop)
# Top 1000 names per group.
def get_top1k(group):
    """Return up to 1000 rows of *group* with the most births, largest first."""
    # sort_values replaces the removed sort_index(by=...) spelling.
    return group.sort_values(by='births', ascending=False)[:1000]
grouped = names.groupby(['year', 'sex'])
top1k = grouped.apply(get_top1k)
# year and sex are already columns, so drop the group index that apply adds;
# keeping it makes later groupby/pivot_table calls on 'year' ambiguous.
top1k = top1k.reset_index(drop=True)
# Split the top-1000 rows into boys and girls.
boys = top1k[top1k.sex == 'M']
girls = top1k[top1k.sex == 'F']
# Total births per year for every name.
# pivot_table takes index=/columns=; the old rows=/cols= keywords were removed.
total_births = top1k.pivot_table('births', index='year', columns='name',
                                 aggfunc='sum')
# Plot the yearly trend for a handful of names, one subplot per name.
subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]
subset.plot(subplots=True, figsize=(12, 10), grid=False,
            title='Number of births per year')
# Sum of prop over the top-1000 rows, per year and sex.
table = top1k.pivot_table('prop', index='year', columns='sex', aggfunc='sum')
table.plot(title='Sum of table1k.prop by year & sex',
           yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))