from pandas import Series, DataFrame #For convinence
import pandas as pd
# --- Series basics ---------------------------------------------------------
# Build a Series from a plain list (no explicit index given).
obj = Series([1, 4, 5, 8])
obj
obj.values
obj.index

# Build a Series with explicit string labels.
obj2 = Series([1, 4, 5, 8], index=['a', 'b', 'c', 'd'])
obj2
obj2['c']
obj2[['a', 'b', 'c']]
obj2['a'] = 0
obj2
obj2[obj2 > 0]
obj2 ** 2

# Build a Series from a dict.
dict1 = {'a': 0, 'b': 300, 'c': 500}
obj3 = Series(dict1)
obj3

# Same dict, but with an explicit list of labels; 'd' is not a key of dict1,
# so that entry is missing (checked with notnull/isnull below).
index = ['d', 'c', 'b', 'a']
obj4 = Series(dict1, index)
obj4

# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python 2-only ``print x`` statement.
print(pd.notnull(obj4))
print('=====')
print(pd.isnull(obj4))
print(obj3)
obj4
obj3 + obj4  # data alignment
obj4.name = 'series'
obj4.index.name = "index"
obj4
obj4.index = ['d', 'e', 'f', 'g']  # replace the index labels
obj4
# --- DataFrame basics ------------------------------------------------------
# Build a DataFrame from a dict of equal-length lists.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
frame

# Explicit column order and row labels; 'debt' is not a key of ``data``.
frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                   index=['a', 'b', 'c', 'd', 'e'])
frame2
frame.columns
frame.year
frame['year']
# ``.ix`` was removed in pandas 1.0. ``frame`` has integer row labels, so the
# label-based ``.loc`` selects the same row that ``.ix[3]`` did.
frame.loc[3]

# Assigning to a column: a scalar, a sequence, then a Series.
frame2['debt'] = 150
frame2
frame2['debt'] = range(5)
frame2
val = Series([1, 3, 4], index=['b', 'd', 'e'])
frame2['debt'] = val
frame2
frame2['eastern'] = frame2.state == 'Ohio'
frame2

# Build a DataFrame from a dict of dicts.
dictt = {
    'a': {1: 300, 2: 500, 3: 400},
    'b': {1: 200, 2: 300, 3: 500},
    'c': {1: 130, 2: 450, 3: 460},
}
DataFrame(dictt)
frame3 = DataFrame(dictt, index=[2, 3, 4])
frame3
frame3.index.name = 'row'
frame3.columns.name = 'columns'
frame3
frame3.values
# --- Index objects ---------------------------------------------------------
# Keep a reference to the Series' index so it can be inspected and reused.
obj = Series(range(3), index=list('abc'))
obj
index = obj.index
index
index[1:3]  # an Index can be sliced
Index对象是不可修改的(immutable),保证了Index对象可以在数据结构之间安全共享;
# Item assignment on an Index raises TypeError. Catch it and show the message
# so the demonstration does not abort the statements that follow.
try:
    index[1] = 'd'
except TypeError as exc:
    print(exc)

# Reuse the same Index object for a second Series.
obj2 = Series(range(3), index=index)
obj2
obj2.index is index
'c' in index
Index的方法和属性:append, diff, intersection, union, isin, delete, drop, insert, is_monotonic, is_unique, unique
(方法的输入也须是index对象)
# --- reindex ---------------------------------------------------------------
obj = Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))
obj
# Rearrange to a new label order; 'e' is not an existing label.
obj.reindex(list('abcde'))
# Same, but supply a value for labels that were not present.
obj.reindex(list('abcde'), fill_value=0)

obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3
# Forward-fill while reindexing onto 0..5.
obj3.reindex(range(6), method='ffill')
reindex的method选项:
ffill/pad 前向填充/搬运值
bfill/backfill 后向填充/搬运值
import numpy as np
# --- drop ------------------------------------------------------------------
# 4x4 frame holding 0..15, labelled rows 'a'..'d' and columns 'one'..'four'.
obj4 = DataFrame(
    np.arange(16).reshape(4, 4),
    index=list('abcd'),
    columns=['one', 'two', 'three', 'four'])
obj4
# Drop by row label (single label, then a list of labels).
obj4.drop('c')
obj4.drop(['a', 'b'])
# Drop by column label via axis=1.
obj4.drop('one', axis=1)
obj4.drop(['one', 'two'], axis=1)
# --- Series indexing -------------------------------------------------------
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj
# Positional access goes through ``.iloc``: using integer keys with plain
# ``[]`` on a string-labelled Series relies on a positional fallback that
# recent pandas versions deprecate.
obj.iloc[2] == obj['c']
obj.iloc[[1, 2]]
# Boolean-mask selection.
obj[obj < 2]
# Label-based slice, then assignment through the same slice.
obj['a':'c']
obj['a':'c'] = 5
obj
# --- DataFrame indexing ----------------------------------------------------
# 4x4 frame holding 0..15 with string row labels and string column labels.
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['a', 'b', 'c', 'd'],
                 columns=['1', '2', '3', '4'])
data
# Boolean mask built from one column, then used to filter rows.
data['3'] > 5
data[data['3'] > 5]
# ``.ix`` was removed in pandas 1.0; every key here is a label, so ``.loc``
# is the direct replacement.
data.loc[['a', 'b'], ['1', '3']]
data.loc[['a', 'b'], :]
# Cross-section: the row labelled 'a'.
data.xs('a')
# --- Arithmetic with alignment ---------------------------------------------
obj
obj2
obj + obj2
# fill_value must be a number, not the string '1': it stands in for a value
# that is missing from one operand before the addition is performed.
obj.add(obj2, fill_value=1)  # 4 == obj['d'] + fill_value
add-加法,sub-减法,div-除法,mul-乘法
# --- DataFrame / Series arithmetic -----------------------------------------
frame = DataFrame(np.arange(16).reshape((4, 4)))
frame
# ``.ix`` was removed in pandas 1.0. ``frame`` has integer row and column
# labels, so the label-based ``.loc`` selects what ``.ix`` selected.
srs = frame.loc[0]  # first row
srs
frame - srs
srs1 = frame.loc[:, 0]  # first column
srs1
frame = DataFrame(np.arange(16).reshape((4, 4)))
frame - srs1
# A Series whose labels (0, 2, 3) cover only some of the frame's columns.
srs2 = Series(range(3), index=[0, 2, 3])
srs2
frame - srs2
frame
srs3 = frame[2]  # third column
srs3
frame - srs3
# Same operands, but matched along axis=0.
frame.sub(srs3, axis=0)

# Range (max - min) of whatever is passed in; applied to the frame below.
f = lambda x: x.max() - x.min()
frame
frame.apply(f)
def g(x):
    """Return the maximum and minimum of ``x`` as a two-element Series.

    ``x`` must provide ``max()`` and ``min()`` methods. The result holds the
    maximum first and the minimum second.
    """
    return Series([x.max(), x.min()])
# g returns a Series, so it can be handed to apply() like any other callable.
frame.apply(g)


def h(x):
    """Format ``x`` as a string with two decimal places."""
    return '%.2f' % x


# Element-wise application of the formatter.
frame.applymap(h)
sort_index()方法可以对Series和DataFrame按照索引的字典序进行排序,默认对行索引排序(axis=0),可添加选项axis=1对列索引排序;默认升序,可添加选项ascending=False进行降序排序;如果希望根据某一列或多列的值进行排序,可以通过添加by=['列名']选项达到目的;
order方法则是按值排序,缺失值会被放到末尾;
pandas的数据结构的轴标签并非强制需要唯一;.index.is_unique属性可以判定是否唯一
DataFrame有sum,mean等方法可以对其进行各类运算,可添加axis(默认按列),skipna(排除缺失值,默认True)等选项;
下表列出所有与描述统计相关的方法:
方法 | 说明 |
---|---|
count | 非NA的数量 |
describe | 针对pandas的两种数据结构计算汇总统计 |
min、max | 最小/大值 |
argmin、argmax | 最小/大值的索引位置 |
idxmin、idxmax | 最小/大值的索引值 |
quantile | 样本分位数 |
sum、mean | 总和/平均值 |
median | 中位数 |
mad | 根据平均值计算平均绝对离差 |
var、std | 方差/标准差 |
skew、kurt | 偏度/峰度(三/四阶矩) |
cumsum | 累计和 |
cummin、cummax | 累计最小/最大值 |
cumprod | 累计积 |
diff | 一阶差分 |
pct_change | 百分数变化 |
# --- unique / value_counts / isin ------------------------------------------
# 5 x 'a', 5 x 'b', 10 x 'c', 3 x 'd'
obj = Series(['a'] * 5 + ['b'] * 5 + ['c'] * 10 + ['d'] * 3)
obj
obj.unique()
obj.value_counts()
# Boolean mask marking the entries equal to 'a' or 'c', then used as a filter.
mask = obj.isin(['a', 'c'])
mask
obj[mask]
方法 | 说明 |
---|---|
dropna | 根据各标签的值中是否存在缺失数据对轴标签进行过滤,可通过阈值调节对缺失值的容忍度 |
fillna | 用指定值或插值方法填充缺失数据 |
isnull | 返回一个含布尔值的对象判断哪些是缺失值/NA |
notnull | isnull的否定式 |
# --- dropna ----------------------------------------------------------------
# Series: show the data, then the result of dropna().
Series([1, np.nan, 3, np.nan])
Series([1, np.nan, 3, np.nan]).dropna()

# DataFrame whose last row is entirely NaN: default dropna() vs. how='all'.
DataFrame([[1, 2, np.nan],
           [2, np.nan, 4],
           [np.nan, 9, 2],
           [1, 4, 5],
           [np.nan, np.nan, np.nan]])
DataFrame([[1, 2, np.nan],
           [2, np.nan, 4],
           [np.nan, 9, 2],
           [1, 4, 5],
           [np.nan, np.nan, np.nan]]).dropna()
DataFrame([[1, 2, np.nan],
           [2, np.nan, 4],
           [np.nan, 9, 2],
           [1, 4, 5],
           [np.nan, np.nan, np.nan]]).dropna(how='all')

# DataFrame whose last column has no NaN: dropna along axis=1.
DataFrame([[1, 2, np.nan, 3],
           [2, np.nan, 4, 5],
           [np.nan, 9, 2, 8],
           [1, 4, 5, 7]])
DataFrame([[1, 2, np.nan, 3],
           [2, np.nan, 4, 5],
           [np.nan, 9, 2, 8],
           [1, 4, 5, 7]]).dropna(axis=1)
fillna()方法可以将缺失值填充为指定常数,如fillna(0),也可以通过传入一个dict指定哪些列替换成哪些值,如fillna({1 : 0.3, 3 : 1})
# --- Hierarchical indexing -------------------------------------------------
# Ten random values indexed by two aligned label lists (outer letters,
# inner integers).
data = Series(
    np.random.randn(10),
    index=[list('aaabbbccdd'),
           [1, 2, 4, 1, 2, 3, 1, 2, 1, 2]])
data
data['a']    # select by outer label
data[:, 2]   # select by inner label
# Pivot the inner level into columns, and back again.
data.unstack()
data.unstack().stack()
对于DataFrame的话,则每条轴都可以有分层索引