Pandas
Here are some tutorials and examples.
Before you start, import numpy and pandas in your code:
import numpy as np
import pandas as pd
Object creation
Series is a Pandas data structure used to present a single column of elements:
pd.Series([1, 2, np.nan, 3])
# 0 1.0
# 1 2.0
# 2 NaN
# 3 3.0
# dtype: float64
DataFrame is the Pandas data structure used to present 2D elements:
df = pd.DataFrame(np.random.randn(4, 4), index=list("1234"), columns=list("ABCD"))
# A B C D
# 1 1.867558 -0.977278 0.950088 -0.151357
# 2 -0.103219 0.410599 0.144044 1.454274
# 3 0.761038 0.121675 0.443863 0.333674
# 4 1.494079 -0.205158 0.313068 -0.854096
df = pd.DataFrame({"A": 1., "B": "foo", "C": pd.Series([1, 2, 3, 4])})
# A B C
# 0 1.0 foo 1
# 1 1.0 foo 2
# 2 1.0 foo 3
# 3 1.0 foo 4
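Mixing a scalar, a string, and a Series like this gives each column its own dtype, which df.dtypes shows (the dtypes below follow directly from the construction above):
df.dtypes
# A    float64
# B     object
# C      int64
# dtype: object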
Viewing Data
Use the head and tail functions to view the first and last rows of a table.
You can also list all of the rows (the index) and the columns:
df.head(1)
# A B C
# 0 1.0 foo 1
df.tail(2)
# A B C
# 2 1.0 foo 3
# 3 1.0 foo 4
df.index
# RangeIndex(start=0, stop=4, step=1)
df.columns
# Index(['A', 'B', 'C'], dtype='object')
Other options include describing the table (mean, std, min, max, ...),
transposing it, or sorting it by a specific axis or by specific columns:
df.describe()
# A C
# count 4.0 4.000000
# mean 1.0 2.500000
# std 0.0 1.290994
# min 1.0 1.000000
# 25% 1.0 1.750000
# 50% 1.0 2.500000
# 75% 1.0 3.250000
# max 1.0 4.000000
df.T # Transpose
df.sort_index(axis=1, ascending=False) # sort by the column labels (axis 1), descending
df.sort_values(by='B') # Sort by column 'B'
Selection
To view only a single column or a range of the data:
df['A'] # view only column A
df[0:3] # view rows 0, 1, 2
df['0':'2'] # view rows labeled '0' through '2' (label slice, endpoints inclusive)
loc can do many things:
# view the contents of a single row, shown as a Series
df.loc[1]
# A 1
# B foo
# C 2
# view any subset of columns
df.loc[:, ['A', 'C']]
# A C
# 0 1.0 1
# 1 1.0 2
# 2 1.0 3
# 3 1.0 4
# view any combination of rows and columns
df.loc[0:2, ['A', 'C']]
# view a few columns of a single row (shown as a Series)
df.loc[3, ['A', 'C']]
# pick a single cell (row, column); returns a scalar value
df.loc[0, 'A']
iloc is the position-based counterpart of loc; it takes integer positions as parameters:
df.iloc[3] # the row at position 3
df.iloc[3:5, 0:2] # rows 3, 4 and columns 0, 1
df.iloc[1:3, :] # rows 1, 2, all columns
df.iloc[[1, 2, 4], [0, 2]] # rows 1, 2, 4 and columns 0, 2
df.iloc[1, 1] # item at (1, 1)
Boolean indexing works as follows:
df[df.A > 0] # rows where column A is greater than 0
df2[df2['E'].isin(['two', 'four'])] # rows where column E is 'two' or 'four'
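df2 is not defined above; a minimal sketch of a frame this example could run on (the column names and values are assumptions):
df2 = pd.DataFrame({'A': np.random.randn(4), 'E': ['one', 'two', 'three', 'four']})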
Settings
pd.Series([1, 2, 3], index=pd.date_range('20100101', periods=3)) # assign any index you want
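The lines below use a dates variable and a date-indexed df that are not defined above; a minimal sketch of such a setup (the exact dates and shape are assumptions):
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))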
df.at[dates[0], 'A'] = 0 # set a value by label
df.iat[0, 1] = 0 # set a value by position
df.loc[:, 'D'] = np.array([5] * len(df)) # assign a whole numpy array to a column
df2[df2 > 0] = -df2 # a where operation: negate every positive value
Missing Data
Here are some ways to handle missing data (NaN):
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
# reindex can change/add/delete the index on a given axis
df1.loc[dates[0]:dates[1], 'E'] = 1
# A B C D F E
# 2013-01-01 0.000000 0.000000 -1.509059 5 NaN 1.0
# 2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0
# 2013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 NaN
# 2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 NaN
df1.dropna(how='any') # drop every row that contains NaN
df1.fillna(value=5) # fill every NaN with 5
pd.isna(df1) # boolean mask of the table (NaN = True)
Apply function
df.apply(np.cumsum) # cumulative sum down each column
df.apply(lambda x: x.max() - x.min()) # range (max - min) of each column
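For instance, a quick sketch on a small hypothetical frame (the values are chosen only for illustration):
small = pd.DataFrame({'x': [1, 2, 3], 'y': [10, 20, 30]})
small.apply(np.cumsum)
#    x   y
# 0  1  10
# 1  3  30
# 2  6  60
small.apply(lambda c: c.max() - c.min())
# x     2
# y    20
# dtype: int64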
Histogramming
s = pd.Series(np.random.randint(0, 7, size=10))
# 0 4
# 1 2
# 2 1
# 3 2
# 4 6
# 5 4
# 6 4
# 7 6
# 8 4
# 9 4
s.value_counts()
# 4 5
# 6 2
# 2 2
# 1 1
Merge
concat can glue pandas objects back together:
pieces = [df[:3], df[3:7], df[7:]]
pd.concat(pieces)
There is also an SQL-like merge (join):
left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})
# key lval
# 0 foo 1
# 1 foo 2
right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})
# key rval
# 0 foo 4
# 1 foo 5
pd.merge(left, right, on='key') # duplicate keys on both sides give the cartesian product of matches
# key lval rval
# 0 foo 1 4
# 1 foo 1 5
# 2 foo 2 4
# 3 foo 2 5
Another merge example:
left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})
# key lval
# 0 foo 1
# 1 bar 2
right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})
# key rval
# 0 foo 4
# 1 bar 5
pd.merge(left, right, on='key')
# key lval rval
# 0 foo 1 4
# 1 bar 2 5
Finally, there is append:
df.append(s, ignore_index=True)
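Note that DataFrame.append was removed in pandas 2.0; a rough equivalent with concat, assuming s is a row-like Series whose index matches df's columns:
pd.concat([df, s.to_frame().T], ignore_index=True)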
Grouping
"group by" 可以細分成以下三個 steps
Splitting the data into groups based on some criteria
Applying a function to each group independently
Combining the results into a data structure
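The listing below comes from a frame like the following (a sketch: the A and B values are read off the listing, the C and D numbers are random):
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})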
# A B C D
# 0 foo one -1.202872 -0.055224
# 1 bar one -1.814470 2.395985
# 2 foo two 1.018601 1.552825
# 3 bar three -0.595447 0.166599
# 4 foo two 1.395433 0.047609
# 5 bar two -0.392670 -0.136473
# 6 foo one 0.007207 -0.561757
# 7 foo three 1.928123 -1.623033
df.groupby('A').sum()
# C D
# A
# bar -2.802588 2.42611
# foo 3.146492 -0.63958
df.groupby(['A', 'B']).sum()
# C D
# A B
# bar one -1.814470 2.395985
# three -0.595447 0.166599
# two -0.392670 -0.136473
# foo one -1.195665 -0.616981
# three 1.928123 -1.623033
# two 2.414034 1.600434
Reshaping
The stack() method “compresses” a level in the DataFrame’s columns.
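The frame shown below carries a two-level MultiIndex; a minimal sketch of how such a df could be built (the index labels are read off the listing, the numbers are random):
index = pd.MultiIndex.from_tuples(
    [('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two')],
    names=['first', 'second'])
df = pd.DataFrame(np.random.randn(4, 2), index=index, columns=['A', 'B'])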
# A B
# first second
# bar one 0.029399 -0.542108
# two 0.282696 -0.087302
# baz one -1.575170 1.771208
# two 0.816482 1.100230
stacked = df.stack()
# first second
# bar one A 0.029399
# B -0.542108
# two A 0.282696
# B -0.087302
# baz one A -1.575170
# B 1.771208
# two A 0.816482
# B 1.100230
The inverse operation of stack() is unstack(), which by default unstacks the last level:
stacked.unstack()
# A B
# first second
# bar one 0.029399 -0.542108
# two 0.282696 -0.087302
# baz one -1.575170 1.771208
# two 0.816482 1.100230
stacked.unstack(1)
# second one two
# first
# bar A 0.029399 0.282696
# B -0.542108 -0.087302
# baz A -1.575170 0.816482
# B 1.771208 1.100230
Categoricals
Pandas can categorize data:
df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = df["raw_grade"].astype("category")
# 0 a
# 1 b
# 2 b
# 3 a
# 4 a
# 5 e
# Name: grade, dtype: category
# Categories (3, object): [a, b, e]
Categories can also be renamed:
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"]) # assigning to .cat.categories directly was removed in pandas 2.0
# 0 very good
# 1 good
# 2 good
# 3 very good
# 4 very good
# 5 very bad
# Name: grade, dtype: category
# Categories (3, object): [very good, good, very bad]
Then you can sort or group by the category (sorting follows the order of the categories):
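The output below lists five categories from very bad to very good, which suggests the categories were first reordered and the missing ones added; a sketch of that step:
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])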
df.sort_values(by="grade")
# id raw_grade grade
# 5 6 e very bad
# 1 2 b good
# 2 3 b good
# 0 1 a very good
# 3 4 a very good
# 4 5 a very good
df.groupby("grade").size()
# grade
# very bad 1
# bad 0
# medium 0
# good 2
# very good 3
# dtype: int64
Plotting
import matplotlib.pyplot as plt
ts = pd.Series(np.random.randn(1000),
index=pd.date_range('1/1/2000', periods=1000))
ts = ts.cumsum()
ts.plot()
plt.show()

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
columns=['A', 'B', 'C', 'D'])
df = df.cumsum()
# A B C D
# 2000-01-01 1.137372 -0.667626 0.120983 -0.568837
# 2000-01-02 0.185677 -1.196407 0.655425 -1.207966
# 2000-01-03 0.284604 -1.341850 -1.269046 -1.799623
# 2000-01-04 2.061235 -0.904345 -0.584501 -2.260388
# 2000-01-05 2.838373 -1.657153 -0.439002 -1.864843
# ... ... ... ... ...
# 2002-09-22 -46.196742 -11.879780 22.752095 -15.641322
# 2002-09-23 -47.016031 -10.803038 23.834051 -14.173797
# 2002-09-24 -45.673721 -12.083731 24.443607 -11.324811
# 2002-09-25 -46.164865 -13.052021 23.609504 -12.678023
# 2002-09-26 -46.298883 -13.105695 23.919682 -12.888132
# [1000 rows x 4 columns]
df.plot()
plt.legend(loc='best')
plt.show()

Getting Data In/Out
Reading and writing csv files:
pd.read_csv('foo.csv')
df.to_csv('foo.csv')
Excel
pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])
df.to_excel('foo.xlsx', sheet_name='Sheet1')