Pandas¶
An open-source library for data analysis
- Low-level operations are implemented in Cython (Cython = C + Python, usually faster); provides database-like structures roughly similar to those available in R
- Optimized for the most common operations
- For example, vectorized operations and operations on table rows
pandas is a Python package providing fast, flexible, and expressive data structures designed to make working with "relational" data both easy and intuitive.
Installing pandas¶
anaconda
- conda install pandas
Using a mirror
- pip
- pip install -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple pandas
- conda
- Add the mirror channel
- conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main/
- Update the package index
- conda update --all
- Install the desired package
- conda install pandas
Basic data structures¶
- Series: a one-dimensional labeled array
- "Labeled" just means there is an index into the array
- Supports vectorized operations
- DataFrame: a table of rows, with labeled columns
- Like a spreadsheet or an R data frame
- Supports numpy ufuncs (provided the data is numeric)
pandas: Series¶
In [67]:
# By default, the index is integers, starting from 0
import pandas as pd
import numpy as np
numbers = np.random.randn(5)
s = pd.Series(numbers)
s
Out[67]:
0    0.423218
1   -0.489838
2    0.855377
3    0.886844
4   -0.061101
dtype: float64
In [69]:
# A pandas Series can be created from any array-like structure (e.g., a numpy array, a Python list, a dict)
# Other collections can be used as the index
idx = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(numbers, index=idx)
s
Out[69]:
a    0.423218
b   -0.489838
c    0.855377
d    0.886844
e   -0.061101
dtype: float64
pandas: Series¶
In [72]:
# A Series can be created from a dict. The keys become the index
d = {'a': 0, 'b' : 1, 'c': 2}
s = pd.Series(d)
s
Out[72]:
a    0
b    1
c    2
dtype: int64
In [74]:
# Series support slicing, but not negative indexing.
s = pd.Series([2, 3, 5, 7, 11])
s[1:3]
Out[74]:
1    3
2    5
dtype: int64
In [76]:
s[-1]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File C:\ProgramData\anaconda3\Lib\site-packages\pandas\core\indexes\range.py:414, in RangeIndex.get_loc(self, key)
    413 try:
--> 414     return self._range.index(new_key)
    415 except ValueError as err:
ValueError: -1 is not in range

The above exception was the direct cause of the following exception:

KeyError                                  Traceback (most recent call last)
Cell In[76], line 1
----> 1 s[-1]
KeyError: -1
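Position-based access, including negative positions, still works through the `iloc` accessor; a minimal sketch:

```python
import pandas as pd

s = pd.Series([2, 3, 5, 7, 11])

# iloc indexes purely by position, so negative positions
# count from the end, just as with Python lists.
print(s.iloc[-1])            # 11
print(s.iloc[-2:].tolist())  # [7, 11]
```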
pandas: Series¶
- The index of a pandas Series need not be unique.
In [79]:
s = pd.Series([2, 3, 5, 7, 11], index=['a', 'a', 'a', 'a', 'a'])
s['a']
Out[79]:
a     2
a     3
a     5
a     7
a    11
dtype: int64
In [81]:
# Series objects act like np.ndarray objects, so they support the same kinds of slicing operations
s[s>3]
Out[81]:
a     5
a     7
a    11
dtype: int64
In [83]:
d = {'dog': 2, 'cat':3, 'bird':5}
s = pd.Series(d)
s ** 2
Out[83]:
dog      4
cat      9
bird    25
dtype: int64
pandas: Series¶
- Series objects are dict-like: we can access and update elements by their keys.
In [86]:
s['dog'] = 1
s
Out[86]:
dog     1
cat     3
bird    5
dtype: int64
pandas: Series¶
- A Series has an optional name attribute. Once set, the name can be changed with the rename method.
- This becomes especially useful when we get to DataFrames, where these name attributes become column names.
In [89]:
s.name = 'animals'
s
Out[89]:
dog     1
cat     3
bird    5
Name: animals, dtype: int64
In [91]:
s.rename('mammal')
Out[91]:
dog     1
cat     3
bird    5
Name: mammal, dtype: int64
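Note that `rename` returns a renamed copy; the original Series keeps its old name unless we reassign the result. A small sketch illustrating this:

```python
import pandas as pd

s = pd.Series([1, 3, 5], index=['dog', 'cat', 'bird'], name='animals')

t = s.rename('mammal')  # returns a renamed copy
print(s.name)           # 'animals' -- the original is unchanged
print(t.name)           # 'mammal'

s = s.rename('mammal')  # reassign to keep the new name
print(s.name)           # 'mammal'
```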
Mapping and linking Series values¶
- The Series map method works like Python's map function: it takes a function and applies it to each element.
- map also accepts a dict or Series, in which case each value is looked up in it (as in the second example below).
In [94]:
s = pd.Series(['dog', 'goat', 'skunk'])
s.map(lambda s:len(s))
Out[94]:
0    3
1    4
2    5
dtype: int64
In [96]:
s = pd.Series(['fruit', 'animal', 'animal', 'fruit', 'fruit'], index=['apple', 'cat', 'goat', 'banana', 'kiwi'])
s
Out[96]:
apple      fruit
cat       animal
goat      animal
banana     fruit
kiwi       fruit
dtype: object
In [98]:
t = pd.Series({'fruit':0, 'animal':1})
t
Out[98]:
fruit     0
animal    1
dtype: int64
In [100]:
s.map(t)
Out[100]:
apple     0
cat       1
goat      1
banana    0
kiwi      0
dtype: int64
pandas: DataFrames¶
- The basic unit of pandas
- Similar to an R data frame (data.frame)
- A two-dimensional structure (i.e., rows and columns)
- Columns may have different types
- Can be created from many different objects
- a dict of {ndarrays, Python lists, dicts, Series}
- an ndarray
- a Series
In [103]:
# Creating a DataFrame from a dict: the keys become column names and the values become the columns.
# Rows left unspecified for a given column receive NaN.
d = {'A':pd.Series([1,2,3], index=['cat','dog','bird']),
'B':{'cat':3.14, 'dog':2.718, 'bird':1.618, 'goat':0.5772}}
df = pd.DataFrame(d)
df
Out[103]:
|  | A | B |
|---|---|---|
| bird | 3.0 | 1.6180 |
| cat | 1.0 | 3.1400 |
| dog | 2.0 | 2.7180 |
| goat | NaN | 0.5772 |
In [105]:
d = {'Undergrad' : pd.Series(['UMich', 'Stanford', 'Princeton', 'Columbia'],
index=['Ford', 'Hoover', 'Wilson', 'Obama']),
'PhD' : {'Wilson':'Johns Hopkins'},
'JD' : {'Ford':'Yale','Obama':'Harvard'},
'Terms': pd.Series([1,1,2,2]) }
presidents = pd.DataFrame(d)
presidents
Out[105]:
|  | Undergrad | PhD | JD | Terms |
|---|---|---|---|---|
| Ford | UMich | NaN | Yale | NaN |
| Hoover | Stanford | NaN | NaN | NaN |
| Obama | Columbia | NaN | Harvard | NaN |
| Wilson | Princeton | Johns Hopkins | NaN | NaN |
| 0 | NaN | NaN | NaN | 1.0 |
| 1 | NaN | NaN | NaN | 1.0 |
| 2 | NaN | NaN | NaN | 2.0 |
| 3 | NaN | NaN | NaN | 2.0 |
pandas DataFrames: row/column names¶
- Row and column names can be accessed as the DataFrame's index and columns attributes, respectively; both are returned as pandas Index objects.
In [108]:
presidents.columns
Out[108]:
Index(['Undergrad', 'PhD', 'JD', 'Terms'], dtype='object')
In [110]:
presidents.index
Out[110]:
Index(['Ford', 'Hoover', 'Obama', 'Wilson', 0, 1, 2, 3], dtype='object')
pandas DataFrames: accessing/adding columns¶
- A DataFrame behaves like a dict whose keys are column names and whose values are Series.
- As with a dict, we can create new key-value pairs.
In [113]:
d = {'Undergrad' : pd.Series(['UMich', 'Stanford', 'Princeton', 'Columbia'],
index=['Ford', 'Hoover', 'Wilson', 'Obama']),
'PhD' : {'Wilson':'Johns Hopkins'},
'JD' : {'Ford':'Yale','Obama':'Harvard'},
'Terms': pd.Series([1,1,2,2], index=['Ford', 'Hoover', 'Wilson', 'Obama']) }
presidents = pd.DataFrame(d)
presidents['Nobels'] = [0,0,1,1]
presidents
Out[113]:
|  | Undergrad | PhD | JD | Terms | Nobels |
|---|---|---|---|---|---|
| Ford | UMich | NaN | Yale | 1 | 0 |
| Hoover | Stanford | NaN | NaN | 1 | 0 |
| Obama | Columbia | NaN | Harvard | 2 | 1 |
| Wilson | Princeton | Johns Hopkins | NaN | 2 | 1 |
In [115]:
# A scalar is broadcast across all rows.
presidents['Fields Medals'] = 0
presidents
Out[115]:
|  | Undergrad | PhD | JD | Terms | Nobels | Fields Medals |
|---|---|---|---|---|---|---|
| Ford | UMich | NaN | Yale | 1 | 0 | 0 |
| Hoover | Stanford | NaN | NaN | 1 | 0 | 0 |
| Obama | Columbia | NaN | Harvard | 2 | 1 | 0 |
| Wilson | Princeton | Johns Hopkins | NaN | 2 | 1 | 0 |
pandas DataFrames: deleting columns¶
In [118]:
del presidents['Fields Medals']
presidents
Out[118]:
|  | Undergrad | PhD | JD | Terms | Nobels |
|---|---|---|---|---|---|
| Ford | UMich | NaN | Yale | 1 | 0 |
| Hoover | Stanford | NaN | NaN | 1 | 0 |
| Obama | Columbia | NaN | Harvard | 2 | 1 |
| Wilson | Princeton | Johns Hopkins | NaN | 2 | 1 |
pandas DataFrames: indexing and selection¶
In [121]:
# df.loc selects rows by their labels. df.iloc selects rows by their integer positions (starting from 0).
presidents.loc['Obama']
Out[121]:
Undergrad    Columbia
PhD               NaN
JD            Harvard
Terms               2
Nobels              1
Name: Obama, dtype: object
In [123]:
presidents.iloc[2]
Out[123]:
Undergrad    Columbia
PhD               NaN
JD            Harvard
Terms               2
Nobels              1
Name: Obama, dtype: object
In [125]:
# Multiple rows
presidents.iloc[1:3], presidents[1:3]
Out[125]:
(        Undergrad  PhD       JD  Terms  Nobels
 Hoover   Stanford  NaN      NaN      1       0
 Obama    Columbia  NaN  Harvard      2       1,
         Undergrad  PhD       JD  Terms  Nobels
 Hoover   Stanford  NaN      NaN      1       0
 Obama    Columbia  NaN  Harvard      2       1)
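One subtlety worth noting: slicing by label via `loc` includes both endpoints, whereas positional slicing excludes the stop position. A minimal sketch (with a hypothetical single-column frame):

```python
import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3, 4]},
                  index=['Ford', 'Hoover', 'Obama', 'Wilson'])

# Positional slice: stop position excluded -> 2 rows
print(df.iloc[1:3].index.tolist())               # ['Hoover', 'Obama']

# Label slice: both endpoints included -> 3 rows
print(df.loc['Hoover':'Wilson'].index.tolist())  # ['Hoover', 'Obama', 'Wilson']
```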
In [127]:
# Select a column by its name.
presidents['Undergrad']
Out[127]:
Ford          UMich
Hoover     Stanford
Obama      Columbia
Wilson    Princeton
Name: Undergrad, dtype: object
In [129]:
# Get multiple columns
presidents[['Undergrad', 'PhD']]
Out[129]:
|  | Undergrad | PhD |
|---|---|---|
| Ford | UMich | NaN |
| Hoover | Stanford | NaN |
| Obama | Columbia | NaN |
| Wilson | Princeton | Johns Hopkins |
In [131]:
# Select rows with a boolean expression
type(presidents['Terms']<2), presidents[presidents['Terms']<2]
Out[131]:
(pandas.core.series.Series,
         Undergrad  PhD    JD  Terms  Nobels
 Ford       UMich  NaN  Yale      1       0
 Hoover  Stanford  NaN   NaN      1       0)
DataFrame arithmetic¶
In [134]:
df1 = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
df2 = pd.DataFrame(np.random.randn(5, 3), columns=['A', 'B', 'C'])
df1+df2
Out[134]:
|  | A | B | C | D |
|---|---|---|---|---|
| 0 | 3.674877 | 1.013402 | 2.442386 | NaN |
| 1 | -0.224082 | 0.658016 | -0.229887 | NaN |
| 2 | 1.283483 | -0.970610 | -0.261253 | NaN |
| 3 | -2.611369 | 1.511940 | 1.579408 | NaN |
| 4 | 0.354213 | -1.356983 | 0.381391 | NaN |
| 5 | NaN | NaN | NaN | NaN |
| 6 | NaN | NaN | NaN | NaN |
| 7 | NaN | NaN | NaN | NaN |
In [136]:
df = pd.DataFrame(np.random.randn(4, 2), columns=['A', 'B'])
df
Out[136]:
|  | A | B |
|---|---|---|
| 0 | -0.458210 | 0.660018 |
| 1 | 1.586089 | 0.266234 |
| 2 | 0.206351 | 1.065654 |
| 3 | -0.450068 | -0.274079 |
In [138]:
df - df.iloc[0]
Out[138]:
|  | A | B |
|---|---|---|
| 0 | 0.000000 | 0.000000 |
| 1 | 2.044299 | -0.393784 |
| 2 | 0.664560 | 0.405636 |
| 3 | 0.008142 | -0.934097 |
DataFrame arithmetic¶
In [141]:
print((df>0).any())
print('*************************')
print((df>0).all())
A    True
B    True
dtype: bool
*************************
A    False
B    False
dtype: bool
In [143]:
# The values attribute gives the elements of the DataFrame as a numpy array.
df.values
Out[143]:
array([[-0.45820979,  0.66001809],
       [ 1.58608942,  0.26623444],
       [ 0.2063507 ,  1.0656545 ],
       [-0.45006761, -0.27407931]])
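In current pandas, `to_numpy()` is the documented way to obtain this array (`values` remains available); a minimal sketch:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1.0, 2.0], 'B': [3.0, 4.0]})

arr = df.to_numpy()  # same contents as df.values
print(type(arr))     # <class 'numpy.ndarray'>
print(arr.shape)     # (2, 2)
```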
DataFrame arithmetic¶
In [146]:
df.T @ df
Out[146]:
|  | A | B |
|---|---|---|
| A | 2.970777 | 0.463098 |
| B | 0.463098 | 1.717244 |
In [148]:
df * df
Out[148]:
|  | A | B |
|---|---|---|
| 0 | 0.209956 | 0.435624 |
| 1 | 2.515680 | 0.070881 |
| 2 | 0.042581 | 1.135620 |
| 3 | 0.202561 | 0.075119 |
Dropping NaN¶
- The DataFrame dropna method drops rows or columns that contain NaN.
In [151]:
presidents.dropna(axis=0)
Out[151]:
| Undergrad | PhD | JD | Terms | Nobels |
|---|---|---|---|---|
In [153]:
presidents.dropna(axis=1)
Out[153]:
|  | Undergrad | Terms | Nobels |
|---|---|---|---|
| Ford | UMich | 1 | 0 |
| Hoover | Stanford | 1 | 0 |
| Obama | Columbia | 2 | 1 |
| Wilson | Princeton | 2 | 1 |
Summarizing a DataFrame¶
In [157]:
baseball = pd.read_csv("baseball.csv")
# The head() method shows the first few rows of a DataFrame. tail() shows the last few.
baseball.head(5)
Out[157]:
|  | id | year | stint | team | lg | g | ab | r | h | X2b | ... | rbi | sb | cs | bb | so | ibb | hbp | sh | sf | gidp |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4 | ansonca01 | 1871 | 1 | RC1 | NaN | 25 | 120 | 29 | 39 | 11 | ... | 16.0 | 6.0 | 2.0 | 2 | 1.0 | NaN | NaN | NaN | NaN | NaN |
| 44 | forceda01 | 1871 | 1 | WS3 | NaN | 32 | 162 | 45 | 45 | 9 | ... | 29.0 | 8.0 | 0.0 | 4 | 0.0 | NaN | NaN | NaN | NaN | NaN |
| 68 | mathebo01 | 1871 | 1 | FW1 | NaN | 19 | 89 | 15 | 24 | 3 | ... | 10.0 | 2.0 | 1.0 | 2 | 0.0 | NaN | NaN | NaN | NaN | NaN |
| 99 | startjo01 | 1871 | 1 | NY2 | NaN | 33 | 161 | 35 | 58 | 5 | ... | 34.0 | 4.0 | 2.0 | 3 | 0.0 | NaN | NaN | NaN | NaN | NaN |
| 102 | suttoez01 | 1871 | 1 | CL1 | NaN | 29 | 128 | 35 | 45 | 3 | ... | 23.0 | 3.0 | 1.0 | 1 | 0.0 | NaN | NaN | NaN | NaN | NaN |
5 rows × 22 columns
Statistical operations on DataFrames¶
In [160]:
df.mean(axis=0) # column means
Out[160]:
A    0.221041
B    0.429457
dtype: float64
In [162]:
df.mean(axis=1) # row means
Out[162]:
0    0.100904
1    0.926162
2    0.636003
3   -0.362073
dtype: float64
Row- and column-wise functions: apply()¶
- DataFrame.apply() takes a function and applies it to each column of the DataFrame.
- The axis argument defaults to 0 (column-wise). Change it to 1 to apply row-wise.
In [165]:
df.apply(np.mean, axis=0)
Out[165]:
A    0.221041
B    0.429457
dtype: float64
In [167]:
df.apply(np.mean, axis=1)
Out[167]:
0    0.100904
1    0.926162
2    0.636003
3   -0.362073
dtype: float64
In [169]:
df.apply(np.exp)
Out[169]:
|  | A | B |
|---|---|---|
| 0 | 0.632415 | 1.934827 |
| 1 | 4.884610 | 1.305041 |
| 2 | 1.229184 | 2.902738 |
| 3 | 0.637585 | 0.760272 |
Aggregating data¶
- Given a list of functions, agg() applies each function to every column of the DataFrame; each function gets one row in the resulting DataFrame
In [172]:
tsdf = pd.DataFrame(np.random.randn(10, 3),
columns=['DOW', 'NASDAQ', 'S&P500'],
index=pd.date_range('1/1/2000', periods=10))
tsdf.head()
Out[172]:
|  | DOW | NASDAQ | S&P500 |
|---|---|---|---|
| 2000-01-01 | -1.111677 | 0.768413 | -0.143206 |
| 2000-01-02 | 0.009493 | 0.068919 | 0.430048 |
| 2000-01-03 | -1.230109 | 0.183881 | -2.149546 |
| 2000-01-04 | 0.307812 | -0.182283 | 0.667303 |
| 2000-01-05 | -1.371142 | 1.114009 | -1.048906 |
In [174]:
import warnings
warnings.filterwarnings('ignore')
tsdf.agg([np.median, np.mean, np.std])
Out[174]:
|  | DOW | NASDAQ | S&P500 |
|---|---|---|---|
| median | -0.104625 | 0.238541 | -0.465609 |
| mean | -0.171084 | 0.181594 | -0.558840 |
| std | 0.885533 | 0.824707 | 0.918165 |
In [176]:
# agg() can take a dict whose keys are column names and whose values are functions.
tsdf.agg({'DOW':'mean',
'NASDAQ':'median',
'S&P500':'max'})
Out[176]:
DOW      -0.171084
NASDAQ    0.238541
S&P500    0.667303
dtype: float64
Element-wise function application¶
- applymap is analogous to Python's map function (and the Series map method): it applies its function argument to every element of the DataFrame.
In [179]:
df = pd.DataFrame({'A': ['cat', 'dog', 'bird'],
'B': ['unicorn','chupacabra','pixie']})
df.applymap(lambda s: s.upper())
Out[179]:
|  | A | B |
|---|---|---|
| 0 | CAT | UNICORN |
| 1 | DOG | CHUPACABRA |
| 2 | BIRD | PIXIE |
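In pandas 2.1 and later, `applymap` is deprecated in favor of the equivalent `DataFrame.map`; a minimal sketch (falling back to `applymap` on older versions):

```python
import pandas as pd

df = pd.DataFrame({'A': ['cat', 'dog', 'bird'],
                   'B': ['unicorn', 'chupacabra', 'pixie']})

# DataFrame.map (pandas >= 2.1) applies a function element-wise,
# exactly like applymap; fall back for older pandas versions.
if hasattr(pd.DataFrame, 'map'):
    upper = df.map(str.upper)
else:
    upper = df.applymap(str.upper)
print(upper)
```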
Iterating over Series and DataFrames¶
In [182]:
s
Out[182]:
apple      fruit
cat       animal
goat      animal
banana     fruit
kiwi       fruit
dtype: object
In [184]:
# Iterating over a Series yields its values
for i in s:
    print(i)
fruit
animal
animal
fruit
fruit
In [186]:
tsdf.head()
Out[186]:
|  | DOW | NASDAQ | S&P500 |
|---|---|---|---|
| 2000-01-01 | -1.111677 | 0.768413 | -0.143206 |
| 2000-01-02 | 0.009493 | 0.068919 | 0.430048 |
| 2000-01-03 | -1.230109 | 0.183881 | -2.149546 |
| 2000-01-04 | 0.307812 | -0.182283 | 0.667303 |
| 2000-01-05 | -1.371142 | 1.114009 | -1.048906 |
In [188]:
# Iterating over a DataFrame yields its column names
for i in tsdf:
    print(i)
DOW
NASDAQ
S&P500
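To iterate over a DataFrame's contents rather than its column names, pandas offers `items()` (name, column) and `iterrows()` (label, row); vectorized operations are usually preferable, but a minimal sketch:

```python
import pandas as pd

df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=['x', 'y'])

# items(): (column name, column Series) pairs
for name, col in df.items():
    print(name, col.tolist())

# iterrows(): (row label, row Series) pairs
for label, row in df.iterrows():
    print(label, row['A'], row['B'])
```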
In [191]:
s1 = pd.Series(np.random.randn(1000))
s2 = pd.Series(np.random.randn(1000))
s1.cov(s2)
Out[191]:
-0.003137610481467702
In [200]:
tsdf.head(3)
Out[200]:
DOW | NASDAQ | S&P500 | |
---|---|---|---|
2000-01-01 | -1.111677 | 0.768413 | -0.143206 |
2000-01-02 | 0.009493 | 0.068919 | 0.430048 |
2000-01-03 | -1.230109 | 0.183881 | -2.149546 |
In [202]:
tsdf.cov()
Out[202]:
DOW | NASDAQ | S&P500 | |
---|---|---|---|
DOW | 0.784169 | -0.374583 | 0.382336 |
NASDAQ | -0.374583 | 0.680142 | -0.046436 |
S&P500 | 0.382336 | -0.046436 | 0.843026 |
In [207]:
tsdf.corr(method='spearman')
Out[207]:
DOW | NASDAQ | S&P500 | |
---|---|---|---|
DOW | 1.000000 | -0.321212 | 0.563636 |
NASDAQ | -0.321212 | 1.000000 | -0.151515 |
S&P500 | 0.563636 | -0.151515 | 1.000000 |
Ranking data¶
- The rank method returns a new Series whose values are the ranks of the data.
In [210]:
import pandas as pd
import numpy as np
s = pd.Series(np.random.randn(5), index=list('abcde'))
s
Out[210]:
a   -1.178760
b    0.219868
c    0.100833
d    0.780885
e    2.962853
dtype: float64
In [212]:
s.rank()
Out[212]:
a    1.0
b    3.0
c    2.0
d    4.0
e    5.0
dtype: float64
In [214]:
tsdf.rank(axis=0)
Out[214]:
|  | DOW | NASDAQ | S&P500 |
|---|---|---|---|
| 2000-01-01 | 3.0 | 8.0 | 7.0 |
| 2000-01-02 | 6.0 | 4.0 | 9.0 |
| 2000-01-03 | 2.0 | 5.0 | 1.0 |
| 2000-01-04 | 8.0 | 3.0 | 10.0 |
| 2000-01-05 | 1.0 | 9.0 | 3.0 |
| 2000-01-06 | 9.0 | 10.0 | 5.0 |
| 2000-01-07 | 5.0 | 7.0 | 4.0 |
| 2000-01-08 | 10.0 | 1.0 | 6.0 |
| 2000-01-09 | 4.0 | 2.0 | 2.0 |
| 2000-01-10 | 7.0 | 6.0 | 8.0 |
In [216]:
tsdf.rank(axis=1)
Out[216]:
|  | DOW | NASDAQ | S&P500 |
|---|---|---|---|
| 2000-01-01 | 1.0 | 3.0 | 2.0 |
| 2000-01-02 | 1.0 | 2.0 | 3.0 |
| 2000-01-03 | 2.0 | 3.0 | 1.0 |
| 2000-01-04 | 2.0 | 1.0 | 3.0 |
| 2000-01-05 | 1.0 | 3.0 | 2.0 |
| 2000-01-06 | 2.0 | 3.0 | 1.0 |
| 2000-01-07 | 2.0 | 3.0 | 1.0 |
| 2000-01-08 | 3.0 | 1.0 | 2.0 |
| 2000-01-09 | 3.0 | 2.0 | 1.0 |
| 2000-01-10 | 1.0 | 3.0 | 2.0 |
Group by: reorganizing data¶
- The "group by" operation is a concept from databases
- Split the data according to some criterion, and apply a function to each of the splits
- Combine the results into a single data structure
- Basic object: the pandas GroupBy object
In [219]:
df = pd.DataFrame({'A' : ['plant', 'animal', 'plant', 'plant'],
'B' : ['apple', 'goat', 'kiwi', 'grape'],
'C' : np.random.randn(4),
'D' : np.random.randn(4)})
df
Out[219]:
|  | A | B | C | D |
|---|---|---|---|---|
| 0 | plant | apple | -0.585502 | 0.214857 |
| 1 | animal | goat | -1.446204 | -1.042365 |
| 2 | plant | kiwi | 0.774599 | -0.545185 |
| 3 | plant | grape | -0.851527 | -1.206412 |
In [221]:
df.groupby('A')
Out[221]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000178706A9790>
In [223]:
# In this example, we split on column 'A',
# which has two values: 'plant' and 'animal',
# so the groups dict has two keys.
df.groupby('A').groups
Out[223]:
{'animal': [1], 'plant': [0, 2, 3]}
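A GroupBy object can also be iterated directly, yielding (group key, sub-DataFrame) pairs, which is handy for inspecting the splits; a minimal sketch:

```python
import pandas as pd

df = pd.DataFrame({'A': ['plant', 'animal', 'plant'],
                   'B': [1, 2, 3]})

# Each iteration yields the group key and the rows in that group
for key, group in df.groupby('A'):
    print(key, group['B'].tolist())
```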
Grouping: aggregation¶
In [226]:
# Split on column 'A', then compute the mean within each group.
# Note that columns that do not support a mean would be dropped, so column 'B' does not appear in the result.
df.groupby('A')[['C', 'D']].mean()
Out[226]:
|  | C | D |
|---|---|---|
| A |  |  |
| animal | -1.446204 | -1.042365 |
| plant | -0.220810 | -0.512247 |
In [230]:
# Build a hierarchically indexed Series (i.e., a MultiIndex) recording (fictional) student scores by major and handedness.
arrs = [['math', 'math', 'econ', 'econ', 'stats', 'stats'],
['left', 'right', 'left', 'right', 'left', 'right']]
index = pd.MultiIndex.from_arrays(arrs, names=['major', 'handedness'])
s = pd.Series(np.random.randn(6), index=index)
s
Out[230]:
major  handedness
math   left         -2.053607
       right        -0.469359
econ   left          0.678029
       right         0.947858
stats  left          0.273756
       right         0.061540
dtype: float64
In [232]:
# Suppose I want the mean score by major, ignoring handedness.
# Group by level 0 of the hierarchy (i.e., 'major') and take the mean
s.groupby('major').mean()
Out[232]:
major
econ     0.812944
math    -1.261483
stats    0.167648
dtype: float64
Transforming data¶
In [235]:
import pandas as pd
df = pd.DataFrame({
'A': ['a', 'a', 'b', 'b', 'c', 'c'],
'B': [1, 3, 3, 4, 5, 6],
})
zscore = lambda x: (x - x.mean()) / x.std()
# Group the DataFrame, then transform the grouped data
df['B_mean'] = df.groupby('A')['B'].transform('mean')
df['B_std'] = df.groupby('A')['B'].transform('std')
df['Z_Score'] = df.groupby('A')['B'].transform(zscore)
print(df)
   A  B  B_mean     B_std   Z_Score
0  a  1     2.0  1.414214 -0.707107
1  a  3     2.0  1.414214  0.707107
2  b  3     3.5  0.707107 -0.707107
3  b  4     3.5  0.707107  0.707107
4  c  5     5.5  0.707107 -0.707107
5  c  6     5.5  0.707107  0.707107
Filtering data¶
- The argument to filter() must be a function that, applied to an entire group, returns True or False.
In [238]:
# So this drops every group whose mean is at most 2
df_filtered = df.groupby('A')['B'].filter(lambda x: x.mean()>2)
df_filtered
Out[238]:
2    3
3    4
4    5
5    6
Name: B, dtype: int64
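filter can also be applied to the grouped DataFrame itself (rather than a single column), in which case entire rows of the surviving groups are kept; a minimal sketch:

```python
import pandas as pd

df = pd.DataFrame({'A': ['a', 'a', 'b', 'b'],
                   'B': [1, 3, 3, 4]})

# Keep all rows of every group whose mean of 'B' exceeds 2
kept = df.groupby('A').filter(lambda g: g['B'].mean() > 2)
print(kept)
```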
Combining DataFrames¶
- The pandas concat function combines DataFrames into a single DataFrame.
In [241]:
df1 = pd.DataFrame({'A':np.random.randn(4),
'B':np.random.randn(4),
'C':np.random.randn(4)},
index=[0,1,2,3])
df2 = pd.DataFrame({'A':np.random.randn(4),
'B':np.random.randn(4),
'C':np.random.randn(4)},
index=[4,5,6,7])
pd.concat([df1,df2])
Out[241]:
|  | A | B | C |
|---|---|---|---|
| 0 | -1.649543 | 0.107785 | -1.594782 |
| 1 | 0.310221 | 1.527917 | -0.055973 |
| 2 | 0.500854 | 0.712681 | 1.603179 |
| 3 | 0.258046 | -0.065081 | 2.057852 |
| 4 | 0.926570 | 1.890665 | -0.515913 |
| 5 | -0.016323 | -0.290326 | -1.673060 |
| 6 | 0.635052 | -1.612512 | -1.477821 |
| 7 | 0.117893 | 1.098047 | 0.127505 |
- Duplicate indices remain duplicated in the resulting DataFrame.
- Missing values become NaN.
In [244]:
df1 = pd.DataFrame({'A':np.random.randn(4),
'B':np.random.randn(4),
'C':np.random.randn(4)},
index=[0,1,2,3])
df2 = pd.DataFrame({'A':np.random.randn(4),
'B':np.random.randn(4)},
index=[3,4,5,6])
pd.concat([df1,df2])
Out[244]:
|  | A | B | C |
|---|---|---|---|
| 0 | -1.567082 | 1.278266 | -0.632878 |
| 1 | 1.565493 | 0.877441 | -1.214055 |
| 2 | -1.428877 | -1.111943 | -1.400299 |
| 3 | 1.071105 | -0.693203 | 1.313127 |
| 3 | -0.363177 | 0.276126 | NaN |
| 4 | 1.417993 | -1.492555 | NaN |
| 5 | 1.333742 | 0.252255 | NaN |
| 6 | -0.164822 | -0.347508 | NaN |
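concat can also combine DataFrames side by side with axis=1, aligning rows on the index (non-matching labels again become NaN); a minimal sketch:

```python
import pandas as pd

df1 = pd.DataFrame({'A': [1, 2]}, index=[0, 1])
df2 = pd.DataFrame({'B': [3, 4]}, index=[1, 2])

# axis=1 joins column-wise on the (outer) union of the indexes
wide = pd.concat([df1, df2], axis=1)
print(wide)
```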
Merge and Join¶
In [247]:
# The merge function combines two DataFrames on one or more common columns. It can perform inner, left, right, and full outer joins.
df1 = pd.DataFrame({'key': ['A', 'B', 'C'], 'value1': [1, 2, 3]})
df2 = pd.DataFrame({'key': ['A', 'B', 'D'], 'value2': [4, 5, 6]})
df_merge = pd.merge(df1, df2, on='key', how='right')
print(df_merge)
  key  value1  value2
0   A     1.0       4
1   B     2.0       5
2   D     NaN       6
In [249]:
# The join function combines two DataFrames based on their indexes.
import pandas as pd
df1 = pd.DataFrame({'value1': [1, 2, 3]}, index=['A', 'B', 'C'])
df2 = pd.DataFrame({'value2': [4, 5, 6]}, index=['A', 'B', 'D'])
df_join = df1.join(df2, how='outer')
print(df_join)
   value1  value2
A     1.0     4.0
B     2.0     5.0
C     3.0     NaN
D     NaN     6.0
Pivoting and Stacking (unstacking and stacking)¶
In [252]:
# Data in this format is often said to be stacked.
# Data is often stored in files in this form,
# but once read into a table it usually makes more sense to create columns for 2018 and 2019. That is, we want to unstack this DataFrame
data = {'Name': ['Alice', 'Bob', 'Alice', 'Bob'],
'Year': [2018, 2018, 2019, 2019],
'Sales': [100, 200, 150, 250],
'Salary': [3000, 20000, 15000, 5500]}
df = pd.DataFrame(data)
df
Out[252]:
|  | Name | Year | Sales | Salary |
|---|---|---|---|---|
| 0 | Alice | 2018 | 100 | 3000 |
| 1 | Bob | 2018 | 200 | 20000 |
| 2 | Alice | 2019 | 150 | 15000 |
| 3 | Bob | 2019 | 250 | 5500 |
In [254]:
# Reshape the data with the pivot function
df_pivot = df.pivot(index='Name', columns='Year', values=['Sales', 'Salary'])
print(df_pivot)
      Sales      Salary
Year   2018 2019   2018   2019
Name
Alice   100  150   3000  15000
Bob     200  250  20000   5500
In [256]:
df_stack = df_pivot.stack()
df_stack
Out[256]:
|  |  | Sales | Salary |
|---|---|---|---|
| Name | Year |  |  |
| Alice | 2018 | 100 | 3000 |
|  | 2019 | 150 | 15000 |
| Bob | 2018 | 200 | 20000 |
|  | 2019 | 250 | 5500 |
In [258]:
df_pivot.index, df_pivot.columns, df_stack.index, df_stack.columns
Out[258]:
(Index(['Alice', 'Bob'], dtype='object', name='Name'),
 MultiIndex([( 'Sales', 2018),
             ( 'Sales', 2019),
             ('Salary', 2018),
             ('Salary', 2019)],
            names=[None, 'Year']),
 MultiIndex([('Alice', 2018),
             ('Alice', 2019),
             (  'Bob', 2018),
             (  'Bob', 2019)],
            names=['Name', 'Year']),
 Index(['Sales', 'Salary'], dtype='object'))
In [261]:
df = pd.DataFrame(np.random.randn(1000, 3),
                  index=pd.date_range('1/1/2000', periods=1000),
                  columns=['cat','bird','goat'])
# Plot the cumulative sum of each column over time
df_cumsum = df.cumsum()
_ = df_cumsum.plot()