In [1]:
from pandas import Series, DataFrame # For convenience: the cells below use Series/DataFrame without the pd. prefix
import pandas as pd

pandas的数据结构

Series

In [2]:
# Build a Series from a plain list; no index is given, so the default
# integer labels 0..3 are used.
obj = Series(data=[1, 4, 5, 8])
obj
Out[2]:
0    1
1    4
2    5
3    8
dtype: int64
In [3]:
# The underlying data as a NumPy array.
obj.values
Out[3]:
array([1, 4, 5, 8], dtype=int64)
In [4]:
# The index object that holds the row labels.
obj.index
Out[4]:
RangeIndex(start=0, stop=4, step=1)
In [5]:
# Same values as obj, but labelled with explicit strings instead of the
# default integer index.
obj2 = Series([1, 4, 5, 8], index=list('abcd'))
obj2
Out[5]:
a    1
b    4
c    5
d    8
dtype: int64
In [6]:
# Select a single value by its label.
obj2['c']
Out[6]:
5
In [7]:
# Passing a list of labels selects several entries and returns a Series.
obj2[['a','b','c']]
Out[7]:
a    1
b    4
c    5
dtype: int64
In [8]:
# Assign to one label in place, then display the modified Series.
obj2['a']=0
obj2
Out[8]:
a    0
b    4
c    5
d    8
dtype: int64
In [9]:
# Boolean-mask filtering: keep the entries greater than zero; their labels are preserved.
obj2[obj2>0]
Out[9]:
b    4
c    5
d    8
dtype: int64
In [10]:
# Element-wise arithmetic; the index is carried over to the result.
obj2**2
Out[10]:
a     0
b    16
c    25
d    64
dtype: int64
In [11]:
# A dict maps directly onto a Series: keys become the index labels and
# values become the data.
dict1 = dict(a=0, b=300, c=500)
obj3 = Series(data=dict1)
obj3
Out[11]:
a      0
b    300
c    500
dtype: int64
In [12]:
# An explicit index picks and orders the dict entries. 'd' is not a key of
# dict1, so that entry is NaN and the values are shown as float64.
index = list('dcba')
obj4 = Series(dict1, index=index)
obj4
Out[12]:
d      NaN
c    500.0
b    300.0
a      0.0
dtype: float64
In [13]:
# print() with a single parenthesised argument behaves the same on Python 2
# and Python 3, unlike the Python 2-only print statement.
print(pd.notnull(obj4))  # True where a value is present
print('=====')
print(pd.isnull(obj4))  # True where a value is missing (NaN)
d    False
c     True
b     True
a     True
dtype: bool
=====
d     True
c    False
b    False
a    False
dtype: bool
In [14]:
# Show both Series (obj3 via print, obj4 as the cell output).
# print() is used so the cell runs on Python 2 and Python 3 alike.
print(obj3)

obj4
a      0
b    300
c    500
dtype: int64
Out[14]:
d      NaN
c    500.0
b    300.0
a      0.0
dtype: float64
In [15]:
obj3 + obj4 # Data alignment: values are matched by index label, so 'd' (only in obj4) yields NaN
Out[15]:
a       0.0
b     600.0
c    1000.0
d       NaN
dtype: float64
In [16]:
# Both the Series and its index can carry a name; both appear in the displayed output.
obj4.name = 'series'
obj4.index.name = "index"
obj4
Out[16]:
index
d      NaN
c    500.0
b    300.0
a      0.0
Name: series, dtype: float64
In [17]:
obj4.index=['d','e','f','g'] # Relabel the index in place by assigning a new list of labels
obj4
Out[17]:
d      NaN
e    500.0
f    300.0
g      0.0
Name: series, dtype: float64

DataFrame

In [18]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a
# column and the rows get the default integer index.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = DataFrame(data)
frame
Out[18]:
pop state year
0 1.5 Ohio 2000
1 1.7 Ohio 2001
2 3.6 Ohio 2002
3 2.4 Nevada 2001
4 2.9 Nevada 2002
In [19]:
# `columns` fixes the column order; 'debt' is not a key of `data`, so that
# column is all NaN. `index` supplies the row labels.
frame2 = DataFrame(data, columns=['year','state','pop', 'debt'],
          index=['a','b','c','d', 'e'])
frame2
Out[19]:
year state pop debt
a 2000 Ohio 1.5 NaN
b 2001 Ohio 1.7 NaN
c 2002 Ohio 3.6 NaN
d 2001 Nevada 2.4 NaN
e 2002 Nevada 2.9 NaN
In [20]:
# The column labels of the frame (an Index object).
frame.columns
Out[20]:
Index([u'pop', u'state', u'year'], dtype='object')
In [21]:
# Attribute-style access returns the column as a Series named after the column.
frame.year
Out[21]:
0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64
In [22]:
# Dict-style access returns the same column.
frame['year']
Out[22]:
0    2000
1    2001
2    2002
3    2001
4    2002
Name: year, dtype: int64
In [23]:
# Select the row labelled 3 as a Series. `.loc` replaces the deprecated
# `.ix` indexer (removed in pandas 1.0); with this frame's integer index,
# `.ix[3]` was a label lookup as well, so the result is the same.
frame.loc[3]
Out[23]:
pop         2.4
state    Nevada
year       2001
Name: 3, dtype: object
In [24]:
# Assigning a scalar fills every row of the column with that value.
frame2['debt']=150
frame2
Out[24]:
year state pop debt
a 2000 Ohio 1.5 150
b 2001 Ohio 1.7 150
c 2002 Ohio 3.6 150
d 2001 Nevada 2.4 150
e 2002 Nevada 2.9 150
In [25]:
# A sequence with one element per row (5 here) is assigned position by position.
frame2['debt'] = range(5)
frame2
Out[25]:
year state pop debt
a 2000 Ohio 1.5 0
b 2001 Ohio 1.7 1
c 2002 Ohio 3.6 2
d 2001 Nevada 2.4 3
e 2002 Nevada 2.9 4
In [26]:
# Assigning a Series aligns on the index: rows whose labels are missing
# from `val` ('a' and 'c') become NaN.
val = Series([1,3,4], index=['b','d','e'])
frame2['debt']=val
frame2
Out[26]:
year state pop debt
a 2000 Ohio 1.5 NaN
b 2001 Ohio 1.7 1.0
c 2002 Ohio 3.6 NaN
d 2001 Nevada 2.4 3.0
e 2002 Nevada 2.9 4.0
In [27]:
# Assigning to a column name that does not exist yet creates the column;
# here it is a per-row boolean flag.
frame2['eastern'] = frame2.state == 'Ohio'
frame2
Out[27]:
year state pop debt eastern
a 2000 Ohio 1.5 NaN True
b 2001 Ohio 1.7 1.0 True
c 2002 Ohio 3.6 NaN True
d 2001 Nevada 2.4 3.0 False
e 2002 Nevada 2.9 4.0 False
In [28]:
# Nested dict: the outer keys become columns, the inner keys become the row index.
dictt = {'a': {1:300, 2:500, 3:400}, 'b':{1:200,2:300,3:500}, 'c':{1:130,2:450,3:460}}
DataFrame(dictt)
Out[28]:
a b c
1 300 200 130
2 500 300 450
3 400 500 460
In [29]:
# An explicit index keeps rows 2 and 3 and adds row 4, which has no data
# in `dictt` and is therefore all NaN.
frame3 = DataFrame(dictt,index=[2,3,4])
frame3
Out[29]:
a b c
2 500.0 300.0 450.0
3 400.0 500.0 460.0
4 NaN NaN NaN
In [30]:
# Name the row index and the column index; both names appear in the displayed frame.
frame3.index.name = 'row'; frame3.columns.name = 'columns' 
frame3
Out[30]:
columns a b c
row
2 500.0 300.0 450.0
3 400.0 500.0 460.0
4 NaN NaN NaN
In [31]:
# The frame's data as a two-dimensional NumPy array.
frame3.values
Out[31]:
array([[ 500.,  300.,  450.],
       [ 400.,  500.,  460.],
       [  nan,   nan,   nan]])

索引对象

In [32]:
# Rebind `obj` to a small labelled Series whose Index object is inspected next.
obj = Series(range(3), index=['a','b','c'])
obj
Out[32]:
a    0
b    1
c    2
dtype: int64
In [33]:
# Keep a reference to the Series' Index object (this rebinds the earlier `index` list).
index = obj.index
index
Out[33]:
Index([u'a', u'b', u'c'], dtype='object')
In [34]:
# An Index can be sliced like a sequence; the result is another Index.
index[1:3]
Out[34]:
Index([u'b', u'c'], dtype='object')

Index对象是不可修改的(immutable),保证了Index对象可以在数据结构之间安全共享;

In [35]:
# Intentional failure: item assignment on an Index raises TypeError.
index[1]='d'
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-35-d3f90986bdb1> in <module>()
----> 1 index[1]='d'

C:\Users\diary\Anaconda2\lib\site-packages\pandas\indexes\base.pyc in __setitem__(self, key, value)
   1402 
   1403     def __setitem__(self, key, value):
-> 1404         raise TypeError("Index does not support mutable operations")
   1405 
   1406     def __getitem__(self, key):

TypeError: Index does not support mutable operations
In [36]:
# Build a second Series on the very same Index object.
obj2 = Series(range(3), index=index)
obj2
Out[36]:
a    0
b    1
c    2
dtype: int64
In [37]:
# Identity check: the Series holds the same Index object rather than a copy.
obj2.index is index
Out[37]:
True
In [38]:
# Membership test against the index labels.
'c' in index
Out[38]:
True

Index的方法和属性:append, difference(旧版本中名为diff), intersection, union, isin, delete, drop, insert, is_monotonic, is_unique, unique

(方法的输入也须是index对象)

基本功能——操作数据结构中数据的基本手段

重新索引

In [39]:
# A float Series whose labels are deliberately out of alphabetical order,
# used by the reindex examples that follow.
obj = Series(data=[4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj
Out[39]:
d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
In [40]:
obj.reindex(['a','b','c','d','e']) # Rearrange to the given label order; 'e' is a new label and gets NaN
Out[40]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64
In [41]:
# fill_value supplies the value used for labels that did not exist before ('e' -> 0.0).
obj.reindex(['a','b','c','d','e'], fill_value=0)
Out[41]:
a   -5.3
b    7.2
c    3.6
d    4.5
e    0.0
dtype: float64
In [42]:
# A Series with gaps in its integer labels (0, 2, 4), used to demonstrate fill methods.
obj3 = Series(['blue','purple','yellow'], index=[0,2,4])
obj3
Out[42]:
0      blue
2    purple
4    yellow
dtype: object
In [43]:
# method='ffill' forward-fills: each new label (1, 3, 5) takes the value of the preceding existing label.
obj3.reindex(range(6),method='ffill')
Out[43]:
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object
  • reindex的method选项:

    ffill/pad 前向填充/搬运值

    bfill/backfill 后向填充/搬运值

  • 也可以对行和列同时进行重排,如:frame.reindex(index=[.......], method='ffill', columns=[......]);如果不需要method参数,旧版本pandas中可以用frame.ix[[......], [.......]](注意.ix是索引器,要用方括号而不是圆括号;.ix在新版pandas中已被移除,此时请直接使用reindex)

丢弃指定轴上的项

In [44]:
# NOTE(review): numpy is first imported here, in the middle of the notebook;
# every later cell that uses `np` depends on this cell having been run.
import numpy as np
# 4x4 frame holding 0..15, with labelled rows and columns, used by the drop examples.
obj4 = DataFrame(np.arange(16).reshape((4,4)), index=['a','b','c','d'], columns=['one','two','three','four'])
obj4
Out[44]:
one two three four
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
d 12 13 14 15
In [45]:
# drop returns a new frame without row 'c'; obj4 itself is not modified (row 'c' is still there in the next cell).
obj4.drop('c')
Out[45]:
one two three four
a 0 1 2 3
b 4 5 6 7
d 12 13 14 15
In [46]:
# A list of labels drops several rows at once.
obj4.drop(['a','b'])
Out[46]:
one two three four
c 8 9 10 11
d 12 13 14 15
In [47]:
# axis=1 makes drop look up the label among the columns instead of the rows.
obj4.drop('one',axis=1)
Out[47]:
two three four
a 1 2 3
b 5 6 7
c 9 10 11
d 13 14 15
In [48]:
# Several columns can be dropped by passing a list together with axis=1.
obj4.drop(['one','two'],axis=1)
Out[48]:
three four
a 2 3
b 6 7
c 10 11
d 14 15

索引、选取和过滤

In [49]:
# Float Series 0.0..3.0 labelled 'a'..'d', used by the indexing examples below.
obj = Series(data=np.arange(4.), index=list('abcd'))
obj
Out[49]:
a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
In [50]:
# The third element by position is the same value as the one labelled 'c'.
# `.iloc` makes the positional lookup explicit; plain `obj[2]` on a
# string-labelled Series relies on a deprecated integer-position fallback.
obj.iloc[2] == obj['c']
Out[50]:
True
In [51]:
# Select the elements at positions 1 and 2. `.iloc` is used because a list
# of integers passed to plain `[]` on a string-labelled Series depends on
# the deprecated positional fallback.
obj.iloc[[1,2]]
Out[51]:
b    1.0
c    2.0
dtype: float64
In [52]:
# Boolean-mask selection: keep the entries smaller than 2.
obj[obj<2]
Out[52]:
a    0.0
b    1.0
dtype: float64
In [53]:
# Slicing by label includes the end label: 'c' is part of the result.
obj['a':'c']
Out[53]:
a    0.0
b    1.0
c    2.0
dtype: float64
In [54]:
# Assigning through a label slice overwrites 'a' through 'c' in place; 'd' keeps its value.
obj['a':'c']=5
obj
Out[54]:
a    5.0
b    5.0
c    5.0
d    3.0
dtype: float64
In [55]:
# 4x4 frame holding 0..15. The column labels are the *strings* '1'..'4',
# not integers. This rebinds `data`, which was a dict earlier on.
data = DataFrame(np.arange(16).reshape(4, 4), index=list('abcd'), columns=list('1234'))
data
Out[55]:
1 2 3 4
a 0 1 2 3
b 4 5 6 7
c 8 9 10 11
d 12 13 14 15
In [56]:
# Comparing a column with a scalar gives a boolean Series, one entry per row.
data['3']>5
Out[56]:
a    False
b     True
c     True
d     True
Name: 3, dtype: bool
In [57]:
# Indexing the frame with that boolean Series keeps only the rows where it is True.
data[data['3']>5]
Out[57]:
1 2 3 4
b 4 5 6 7
c 8 9 10 11
d 12 13 14 15
In [58]:
# Select rows 'a', 'b' and columns '1', '3' by label. `.loc` replaces the
# deprecated `.ix` indexer (removed in pandas 1.0); all labels here are
# strings, so the selection is identical.
data.loc[['a','b'],['1','3']]
Out[58]:
1 3
a 0 2
b 4 6
In [59]:
# Rows 'a' and 'b' with every column (':' selects the whole axis).
# `.loc` is used instead of the deprecated `.ix` indexer.
data.loc[['a','b'],:]
Out[59]:
1 2 3 4
a 0 1 2 3
b 4 5 6 7
In [60]:
# xs returns the cross-section for row label 'a' as a Series indexed by the column labels.
data.xs('a')
Out[60]:
1    0
2    1
3    2
4    3
Name: a, dtype: int32

算术运算和数据对齐

In [61]:
# First operand: float Series labelled 'a'..'d'.
obj
Out[61]:
a    5.0
b    5.0
c    5.0
d    3.0
dtype: float64
In [62]:
# Second operand: integer Series labelled 'a'..'c' (no 'd').
obj2
Out[62]:
a    0
b    1
c    2
dtype: int64
In [63]:
# Addition aligns on labels; 'd' exists only in obj, so the result there is NaN.
obj+obj2
Out[63]:
a    5.0
b    6.0
c    7.0
d    NaN
dtype: float64
  • 以上用法还可以用于DataFrame,没有重叠的位置同样会等于NaN

在算术方法中填充值

In [64]:
# fill_value stands in for a label missing on one side before adding, so
# 'd' becomes obj['d'] + 1 == 4.0. It is passed as a number: the original
# string '1' only worked through implicit string-to-float coercion.
obj.add(obj2,fill_value=1)
Out[64]:
a    5.0
b    6.0
c    7.0
d    4.0
dtype: float64

add-加法,sub-减法,div-除法,mul-乘法

DataFrame和Series之间的运算

In [65]:
# Rebind `frame` to a 4x4 frame holding 0..15 with default integer row and column labels.
frame = DataFrame(np.arange(16).reshape((4,4)))
frame
Out[65]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
In [66]:
# First row of the frame as a Series indexed by the column labels.
# `.loc[0]` replaces the deprecated `.ix[0]`; with an integer index both
# are label lookups, so the same row is returned.
srs = frame.loc[0]
srs
Out[66]:
0    0
1    1
2    2
3    3
Name: 0, dtype: int32
In [67]:
# The Series' labels are matched against the frame's columns and the
# subtraction is repeated for every row.
frame - srs
Out[67]:
0 1 2 3
0 0 0 0 0
1 4 4 4 4
2 8 8 8 8
3 12 12 12 12
In [68]:
# Column 0 of the frame as a Series indexed by the row labels.
# A single `.loc[:, 0]` replaces the chained `frame.ix[:][0]`, which went
# through the deprecated `.ix` indexer only to select every row.
srs1 = frame.loc[:, 0]
srs1
Out[68]:
0     0
1     4
2     8
3    12
Name: 0, dtype: int32
In [69]:
# Re-create the same 0..15 frame, then subtract a Series taken from a column.
# srs1's labels (0..3) are still matched against the *column* labels, so
# srs1[k] is subtracted from column k rather than from row k.
frame = DataFrame(np.arange(16).reshape((4,4)))
frame - srs1
Out[69]:
0 1 2 3
0 0 -3 -6 -9
1 4 1 -2 -5
2 8 5 2 -1
3 12 9 6 3
In [70]:
# A Series whose labels (0, 2, 3) cover only some of the frame's columns.
srs2 = Series(range(3), index=[0,2,3])
srs2
Out[70]:
0    0
2    1
3    2
dtype: int64
In [71]:
# Column 1 has no matching label in srs2, so the whole column becomes NaN.
frame - srs2
Out[71]:
0 1 2 3
0 0.0 NaN 1.0 1.0
1 4.0 NaN 5.0 5.0
2 8.0 NaN 9.0 9.0
3 12.0 NaN 13.0 13.0
In [72]:
# The frame is unchanged by the subtractions above (they returned new objects).
frame
Out[72]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
In [73]:
# Column 2 of the frame as a Series indexed by the row labels.
srs3 = frame[2]
srs3
Out[73]:
0     2
1     6
2    10
3    14
Name: 2, dtype: int32
In [74]:
# The `-` operator still aligns on columns: srs3[k] is subtracted from column k.
frame - srs3
Out[74]:
0 1 2 3
0 -2 -5 -8 -11
1 2 -1 -4 -7
2 6 3 0 -3
3 10 7 4 1
In [75]:
# axis=0 aligns the Series with the row index instead, so column 2 is subtracted from every column.
frame.sub(srs3, axis=0)
Out[75]:
0 1 2 3
0 -2 -1 0 1
1 -2 -1 0 1
2 -2 -1 0 1
3 -2 -1 0 1

函数应用和映射

In [76]:
# f computes the spread (max minus min) of whatever it is given.
f = lambda x : x.max() - x.min()
frame
Out[76]:
0 1 2 3
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
In [77]:
# apply calls f once per column here, giving one value per column.
frame.apply(f)
Out[77]:
0    12
1    12
2    12
3    12
dtype: int64
In [78]:
 def g(x):
     """Return a two-element Series holding the maximum, then the minimum, of ``x``."""
     largest = x.max()
     smallest = x.min()
     return Series([largest, smallest])
# g returns a Series per column, so apply assembles a DataFrame: row 0 holds the maxima, row 1 the minima.
frame.apply(g)
Out[78]:
0 1 2 3
0 12 13 14 15
1 0 1 2 3
In [79]:
# h formats one number with two decimals; applymap applies it to every element of the frame.
h = lambda x: '%.2f' %x
frame.applymap(h)
Out[79]:
0 1 2 3
0 0.00 1.00 2.00 3.00
1 4.00 5.00 6.00 7.00
2 8.00 9.00 10.00 11.00
3 12.00 13.00 14.00 15.00

排序和排名

sort_index()方法可以对Series和DataFrame按照索引的字典序进行排序:默认(axis=0)按行索引排序,可添加选项axis=1改为按列标签排序;默认升序,可添加选项ascending=False进行降序排序;如果希望按某一列(或某几列)的值进行排序,旧版本可以通过添加by=['列名']选项达到目的,新版本pandas中请使用sort_values(by=['列名']);

order方法(新版本pandas中已改名为sort_values)则是按值排序,缺失值默认会被放到末尾;

带有重复值的轴索引

pandas的数据结构的轴标签并非强制需要唯一;.index.is_unique属性可以判定索引是否唯一

汇总和计算描述统计

DataFrame有sum,mean等方法可以对其进行各类运算,可添加axis(默认按列),skipna(排除缺失值,默认True)等选项;

下表列出所有与描述统计相关的方法:

方法 说明
count 非NA的数量
describe 针对pandas的两种数据结构计算汇总统计
min、max 最小/大值
argmin、argmax 最小/大值的索引位置
idxmin、idxmax 最小/大值的索引值
quantile 样本分位数
sum、mean 总和/平均值
median 中位数
mad 根据平均值计算平均绝对离差
var、std 方差/标准差
skew、kurt 偏度/峰度(三/四阶矩)
cumsum 累计和
cummin、cummax 累计最小/最大值
cumprod 累计积
diff 一阶差分
pct_change 百分数变化

唯一值、值计数以及成员资格

In [80]:
# 23 labels with repeats: 5 x 'a', 5 x 'b', 10 x 'c', 3 x 'd'.
obj = Series(['a'] * 5 + ['b'] * 5 + ['c'] * 10 + ['d'] * 3)
obj
Out[80]:
0     a
1     a
2     a
3     a
4     a
5     b
6     b
7     b
8     b
9     b
10    c
11    c
12    c
13    c
14    c
15    c
16    c
17    c
18    c
19    c
20    d
21    d
22    d
dtype: object
In [81]:
# The distinct values as an array.
obj.unique()
Out[81]:
array(['a', 'b', 'c', 'd'], dtype=object)
In [82]:
# How often each distinct value occurs, shown with the most frequent value first.
obj.value_counts()
Out[82]:
c    10
b     5
a     5
d     3
dtype: int64
In [83]:
# Boolean mask: True where the element is one of the listed values.
mask = obj.isin(['a','c'])
mask
Out[83]:
0      True
1      True
2      True
3      True
4      True
5     False
6     False
7     False
8     False
9     False
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20    False
21    False
22    False
dtype: bool
In [84]:
# Use the mask to keep only the 'a' and 'c' entries; the original labels are preserved.
obj[mask]
Out[84]:
0     a
1     a
2     a
3     a
4     a
10    c
11    c
12    c
13    c
14    c
15    c
16    c
17    c
18    c
19    c
dtype: object

处理缺失数据

方法 说明
dropna 根据各标签的值中是否存在缺失数据对轴标签进行过滤,可通过阈值调节对缺失值的容忍度
fillna 用指定值或插值方法填充缺失数据
isnull 返回一个含布尔值的对象判断哪些是缺失值/NA
notnull isnull的否定式

滤除缺失数据

In [85]:
# A Series containing missing values (NaN); the integers are shown as float64.
Series([1,np.nan,3,np.nan])
Out[85]:
0    1.0
1    NaN
2    3.0
3    NaN
dtype: float64
In [86]:
# dropna removes the NaN entries and keeps the original labels of the rest.
Series([1,np.nan,3,np.nan]).dropna()
Out[86]:
0    1.0
2    3.0
dtype: float64
In [87]:
# A frame with NaN scattered through it; the last row is entirely NaN.
DataFrame([[1,2,np.nan],[2,np.nan,4],[np.nan,9,2],[1,4,5],[np.nan,np.nan,np.nan]])
Out[87]:
0 1 2
0 1.0 2.0 NaN
1 2.0 NaN 4.0
2 NaN 9.0 2.0
3 1.0 4.0 5.0
4 NaN NaN NaN
In [88]:
# With no arguments, dropna removes every row that contains at least one NaN; only row 3 is complete.
DataFrame([[1,2,np.nan],[2,np.nan,4],[np.nan,9,2],[1,4,5],[np.nan,np.nan,np.nan]]).dropna()
Out[88]:
0 1 2
3 1.0 4.0 5.0
In [89]:
# how='all' removes only the rows in which every value is NaN (row 4 here).
DataFrame([[1,2,np.nan],[2,np.nan,4],[np.nan,9,2],[1,4,5],[np.nan,np.nan,np.nan]]).dropna(how='all')
Out[89]:
0 1 2
0 1.0 2.0 NaN
1 2.0 NaN 4.0
2 NaN 9.0 2.0
3 1.0 4.0 5.0
In [90]:
# Same idea with a fourth column (label 3) that has no missing values.
DataFrame([[1,2,np.nan,3],[2,np.nan,4,5],[np.nan,9,2,8],[1,4,5,7]])
Out[90]:
0 1 2 3
0 1.0 2.0 NaN 3
1 2.0 NaN 4.0 5
2 NaN 9.0 2.0 8
3 1.0 4.0 5.0 7
In [91]:
# axis=1 drops columns instead of rows: every column containing a NaN is removed, leaving only column 3.
DataFrame([[1,2,np.nan,3],[2,np.nan,4,5],[np.nan,9,2,8],[1,4,5,7]]).dropna(axis=1)
Out[91]:
3
0 3
1 5
2 8
3 7

填充缺失数据

fillna()方法可以将缺失值填充为指定常数,如fillna(0),也可以通过传入一个dict指定哪些列替换成哪些值,如fillna({1 : 0.3, 3 : 1})

层次化索引

In [92]:
# Series with a two-level (hierarchical) index: the outer level is a letter,
# the inner level a number. The values come from a locally seeded generator
# so that re-running the cell always gives the same numbers; the output
# recorded in this notebook was produced by an earlier, unseeded run.
data = Series(np.random.RandomState(0).randn(10),index = [['a','a','a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],[1,2,4,1,2,3,1,2,1,2]])
data
Out[92]:
a  1    0.756407
   2   -1.492208
   4   -0.622838
b  1   -0.560412
   2   -1.298855
   3    1.346179
c  1    1.039435
   2   -0.109247
d  1   -2.219623
   2   -0.435208
dtype: float64
In [93]:
# Selecting by an outer-level label returns the sub-Series indexed by the inner level.
data['a']
Out[93]:
1    0.756407
2   -1.492208
4   -0.622838
dtype: float64
In [94]:
# Select inner-level label 2 across all outer labels; the result is indexed by the outer level.
data[:,2]
Out[94]:
a   -1.492208
b   -1.298855
c   -0.109247
d   -0.435208
dtype: float64
In [95]:
# unstack turns the inner index level into columns; combinations that do not exist show up as NaN.
data.unstack()
Out[95]:
1 2 3 4
a 0.756407 -1.492208 NaN -0.622838
b -0.560412 -1.298855 1.346179 NaN
c 1.039435 -0.109247 NaN NaN
d -2.219623 -0.435208 NaN NaN
In [96]:
# stack reverses unstack; the NaN cells disappear again, giving back the original ten entries.
data.unstack().stack()
Out[96]:
a  1    0.756407
   2   -1.492208
   4   -0.622838
b  1   -0.560412
   2   -1.298855
   3    1.346179
c  1    1.039435
   2   -0.109247
d  1   -2.219623
   2   -0.435208
dtype: float64

对于DataFrame的话,则每条轴都可以有分层索引