Python3 数据科学入门
环境搭建
数据科学工作流
- Inquire
- Obtain
- Scrub
- Explore
- Model
- Interpret
Anaconda 和 Jupyter notebook
conda:可扩展的包管理工具
- 安装
- 更新
- 创建沙盒
沙盒操作
conda create --name python37 python=3.7 # 创建新的环境
activate python37 # 激活一个环境
deactivate python37 # 退出一个环境
conda remove --name python37 --all # 删除一个环境
Conda包管理
conda install numpy # 安装包
conda list #查看已安装的包 conda list -n python37 # 查看指定环境已安装的包
conda remove -n python37 numpy
Jupyter的使用
Numpy
数据科学领域5个最常用的库
Numpy
- N维数组(矩阵),高效的index,不需要循环;
- 开源免费,运行效率足以和C/Matlab媲美
Scipy
- 依赖Numpy
- 专为科学和工程设计
- 实现了多种常用科学计算:线性代数、傅里叶变换、信号和图像处理
Pandas
- 结构化数据分析利器(依赖Numpy)
- 提供了多种高级数据结构:Time-Series, DataFrame, Panel
- 强大的数据索引和处理能力
Matplotlib
- Python 2D绘图领域最广泛的套件
- 基本能取代Matlab的绘图功能
- 通过mplot3d可以绘制精美的3D图
Scikit-learn
- 机器学习的Python模块
- 建立在Scipy之上
- 简单易学的API接口
矩阵运算
数学知识,同线性代数里面的一样。
创建Array
import numpy as np
list_1 = [1 , 2, 3, 4] array_1 = np.array(list_1)
list_2 = [5, 6, 7, 8] array_2 = np.array([list_1, list_2]) # 二维数组
array_2.shape # m * n array_2.size # array的元素个数 array_2.dtype # array里面元素的数据类型, 不一致取精度最高的
array_4 = np.arange(1, 10) # 左闭右开
array_5 = np.arange(1 ,10, 2) # 间距为2
np.zeros(5) # 1*5全零矩阵 np.zeros((2, 3)) # 2*3全零矩阵
array[0] array[1:5] array[1][0] array[:2, 1:]
数组操作
import numpy as np
np.random.randn(10) np.random.randint(10, size = (2,3))
# reshape改变数组形状 np.zeros(20).reshape(4,5)
# 数组加减乘除 a + b a - b a * b a / b
矩阵操作
np.mat([[1, 2, 3], [4, 5, 6]])
# 将array转为矩阵 np.mat(array)
A = np.mat(a) B = np.mat(b) A + B A * B # A的列数要等于B的行数
Array常用函数
np.unique(a)
sum(a) sum(a[:, 0]) # 第一列的和
a.max() max(a[0]) max(a[:, 0])
Array的input和output
传统python序列化
import pickle import numpy as np
x = np.arange(10) f = open('x.pkl', 'wb') pickle.dump(x,f) f.close()
f = open('x.pkl', 'rb') pickle.load(f) f.close()
numpy提供的序列化
np.save('one_array', x) np.load('one_array.npy')
保存多个数组到一个文件
np.savez('two_array.npz', a = x, b = y) c = np.load('two_array.npz') c['a'] c['b']
Pandas
Series
创建
import numpy as np import pandas as pd
# list s1 = pd.Series([1, 2, 3, 4]) s1.values s1.index
# np array s2 = pd.Series(np.arange(10))
# dict s3 = pd.Series({"1" : 1, "2" : 2, "3" : 3})
# 指定index s4 = pd.Series([1, 2, 3, 4], index = ["A", "B", "C", "D"]) s4.values s4.index
操作
s4["A"] s4[s4 > 2]
# 转化为py的字典 s4.to_dict() s5 = pd.Series(s4.to_dict())
index_1 = ["A", "B", "C", "D", "E"] s6 = pd.Series(s5, index = index_1) # E NaN pd.isnull(s6) # E True pd.notnull(s6) # E False
s6.name = "demo" s6.index.name = "demo index"
DataFrame
import numpy as np import pandas as pd from pandas import Series, DataFrame
import webbrowser link = 'https://www.tiobe.com/tiobe-index/' webbrowser.open(link)
# 从剪切板创建 df = pd.read_clipboard() type(df) df.columns df_new = DataFrame(df, columns = ['Sep 2016', 'Programming Language']) df['Sep 2016'] type(df['Sep 2016']) # Series
df_new['Sep 2018'] = np.arange(10) df_new['Sep 2018'] = pd.Series(np.arange(10)) df_new['Sep 2018'] = pd.Series([100, 200], index = [1, 2])
深入理解Series和DataFrame
Series
data = { 'Country': ['Belgium', 'India', 'Brazil'], 'Capital': ['Brussels', 'New Delhi', 'Brasilia'], 'Population': [11190846, 1303171035, 207847528] }
s1 = pd.Series(data['Country'], index = ['A', 'B', 'C']) s1.values s1.index
DataFrame
df1 = pd.DataFrame(data) df1['Country'] type(df1['Country']) # Series
# 访问某一行 for row in df1.iterrows() : print(row) print(type(row)) # tuple print(row[0]) # index print(row[1]) # Series pass
# 由Series创建DataFrame s1 = pd.Series(data['Capital']) s2 = pd.Series(data['Country']) s3 = pd.Series(data['Population']) df_new = pd.DataFrame([s1, s2, s3], index=[...]) df_new = df_new.T # 转置
DataFrame的IO操作
pd.read_clipboard() df.to_clipboard()
pd.read_csv('df.csv') df.to_csv('df.csv')
pd.read_json(jsonVar) df.to_json()
pd.read_html('df.html') df.to_html('df.html')
pd.read_excel('df.xlsx') df.to_excel('df.xlsx')
DataFrame的Selecting和Indexing
indb = pd.read_csv('xxx.csv') indb.head() # 前5行 indb.head(10) # 10行 indb.tail() # 后5行 indb.tail(10) # 10行
sub_df = indb[['director_name', 'movie_title', 'indb_score']] sub_df.iloc[10:20, :] # 10-20行(左闭右开),所有列 sub_df.iloc[10:20, 0:2 ] # ... 0-2列(左闭右开) 基于位置信息,与名字无关
indb.loc[15:17, : 'movie_title'] # 15,17不是index,是label,15和17都包含
Reindex
series reindex
shift + tab Jupyter显示帮助
s1 = Series([1, 2, 3, 4], index = ['A', 'B', 'C', 'D']) s1.reindex(index = ['A', 'B', 'C', 'D', 'E']) # 新的index对应的值为NaN s1.reindex(index = ['A', 'B', 'C', 'D', 'E'], fill_value = 10) # 新的index对应的值为10 s1.reindex(index = ['A', 'B']) # C D被删掉了 s1.drop('A') # 删掉A
s2 = Series(['A', 'B', 'C'], index = [1, 5, 10]) s2.reindex(index=range(15)) # 1:A 5:B 10:C 其余index对应的是NaN s2.reindex(index=range(15), method = 'ffill') # 0:NaN 1-4:A 5-9:B 10-14:C
DataFrame Reindex
df1 = DataFrame(np.random.rand(25).reshape([5, 5]), index = ['A', 'B', 'D' ,'E', 'F'], columns = ['c1', 'c2', 'c3', 'c4', 'c5']) df1.reindex(index=['A','B','C', 'D', 'E', 'F'], columns = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6']) # C:NaN c6:NaN df1.drop('A', axis = 0) # 删掉A axis=0是index axis=1是columns
NaN
NaN: Not a Number
n = np.nan # 创建类型为NaN的数据类型 type(n) # float m = 1 m + n # nan
s1 = Series([1, 2, np.nan, 3, 4], index = ['A', 'B', 'C', 'D', 'E']) s1.isnull() # 返回新的Series C:True 其它的对应为False s1.notnull() # 同上面相反 s1.dropna() # 将value是NaN的drop掉
df.isnull() # value是NaN的就是True,不是就是False df.notnull() # 同上面相反 df.dropna(axis=0) # 删掉行里面有NaN的行(index) axis=1(列) df.dropna(axis=0, how="any") # 只要一行有NaN就会被删 df.dropna(axis=0, how="all") # 一行全部都是NaN才会被删 df.dropna(axis=0, thresh=2) # 这一行的非NaN值少于2个就会被删 df.fillna(value=1) # 将df里面的NaN替换为1 df.fillna(value={0: 0, 1:1, 2:2, 3:3}) # 第一列NaN的填0 第二列NaN的填1 ...
多级index
多级index Series
s1 = Series(np.random.randn(6), index = [['1', '1', '1', '2','2' , '2'],['a', 'b', 'c', 'a', 'b', 'c']]) s1['1'] # 是一个Series 有a b c三个index s1['2'] # 是一个Series 有a b c三个index s1['1']['a'] s1['2']['a'] s1[:, 'a'] # 一级index随意 二级index是'a' 返回一个Series
多级index Series转换为DataFrame
df1 = s1.unstack() df2 = DataFrame([s1['1'], s1['2']])
DataFrame转换为多级index Series
s2 = df1.unstack() s2 = df1.T.unstack()
多级index DataFrame
df = DataFrame(np.arange(16).reshape(4, 4), index = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]], columns=[['BJ', 'BJ', 'SH', 'GZ'], [8, 9, 8, 8]]) df['BJ'] # 返回DataFrame df['BJ'][8]
Mapping和Replace
df1 = DataFrame({"城市": ["BJ", "SH", "GZ"], "人口": [1000, 2000, 1500]}) df1['GDP'] = Series([1000, 2000, 1500])
df2 = DataFrame({"城市": ["BJ", "SH", "GZ"], "人口": [1000, 2000, 1500]}) gdp_map = {"BJ": 1000, "SH": 2000, "GZ": 1500} df2['GDP'] = df2['城市'].map(gdp_map)
s1 = Series(np.arange(10)) s1.replace(1, np.nan) # 返回新的Series s1.replace({1: np.nan}) s1.replace([1, 2, 3], [10, 20, 30])
Pandas玩转数据
Series和DataFrame简单数学运算
s1 = Series([1, 2, 3], index=['A', 'B', 'C']) s2 = Series([4, 5, 6, 7], index=['B', 'C', 'D', 'E']) s1 + s2 # A:NaN D,E:NaN
df1 = DataFrame(np.arange(4).reshape(2, 2), index = ['A', 'B'], columns=['BJ', 'SH']) df2 = DataFrame(np.arange(9).reshape(3, 3), index = ['A', 'B', 'C'], columns=['BJ', 'SH', 'GZ']) df3 = df1 + df2 df3.sum() df3.min() df3.max() df3.describe() # 返回统计的信息 mean std ...
Series和DataFrame的排序
s1 = Series(np.random.randn(10)) s1.values s1.index
s2 = s1.sort_values() s2 = s1.sort_values(ascending = False) # 降序
s2= s1.sort_index()
df1 = DataFrame(np.random.randn(40).reshape(8, 5), columns=['A', 'B', 'C', 'D', 'E']) df1['A'].sort_values() # Series df2 = df1.sort_values('A') df2.sort_index()
重命名DataFrame的Index
df1 = DataFrame(np.arange(9).reshape(3, 3), index = ['BJ', 'SH', 'GZ'], columns = ['A', 'B', 'C']) df1.index = Series(['bj', 'sh', 'gz']) df1.index = df1.index.map(str.upper) df1 = df1.rename(index = str.lower, columns = str.lower) df1 = df1.rename(index = {'bj': 'beijing'}, columns = {'a': 'A'})
list1 = [1, 2, 3, 4] list2 = [str(x) for x in list1] # ['1', '2', '3', '4'] list3 = list(map(str, list1))
def test_map(x) : return x + '_ABC' df1.index.map(test_map)
DataFrame的merge操作
df1 = DataFrame({'key': ['A', 'B', 'C'], 'data_set1': [1, 2, 3]}) df2 = DataFrame({'key': ['A', 'B', 'C'], 'data_set2': [4, 5, 6]})
pd.merge(df1, df2) pd.merge(df1, df2, on="key") pd.merge(df1, df2, how="inner") # outer pd.merge(df1, df2, how="left") # right
Concatenate和Combine
arr1 = np.arange(9).reshape(3, 3) arr2 = np.arange(9).reshape(3, 3) np.concatenate([arr1, arr2 ]) # axis = 0
s1 = Series([4, 5], index = ['A', 'B']) s2 = Series([1, 2, 3], index = ['X', 'Y', 'Z']) pd.concat([s1, s2]) # axis = 0 pd.concat([s1, s2], axis = 1) # DataFrame
df1 = DataFrame(np.random.randn(4, 3), columns = ['X', 'Y', 'Z']) df2 = DataFrame(np.random.randn(3, 3), columns = ['X', 'Y', 'A']) pd.concat([df1, df2])
s1 = Series([2, np.nan, 4, np.nan], index = ['A', 'B', 'C', 'D']) s2 = Series([1, 2, 3, 4], index = ['A', 'B', 'C', 'D']) s1.combine_first(s2) # 用s2填充s1,s1中的NaN会被s2填充
df1 = DataFrame({ 'X': [1, np.nan, 3, np.nan], 'Y': [5, np.nan, 7, np.nan], 'Z': [9, np.nan, 11, np.nan] }) df2 = DataFrame({ 'Z': [np.nan, 10, np.nan, 12], 'A': [1, 2, 3, 4], }) df1.combine_first(df2)
通过apply进行数据预处理
s1 = Series(['a'] * 7978) df['A'] = s1 # df有7978行 df['A'] = df['A'].apply(str.upper) # 将A这一列变大写
def foo(line): items = line.strip().split(' ') return Series([items[1], items[3], items[5]]) df_tmp = df['data'].apply(foo) df_tmp = df_tmp.rename(columns = {0: "Symbol", 1: "Seqno", 2: "Price"}) df_new = df.combine_first(df_tmp) del df_new['data']
通过去重进行数据清洗
df['Seqno'].duplicated() # False不重复 True重复 df['Seqno'].drop_duplicates() # 将重复的删掉 df.drop_duplicates() df.drop_duplicates(['Seqno']) # 以Seqno为基准 df.drop_duplicates(['Seqno'], keep='last') # 保留重复的最后一个
时间序列操作基础
from datetime import datetime t1 = datetime(2009, 10, 20)
date_list = [ datetime(2016, 9, 1), datetime(2016, 9, 10), datetime(2017, 9, 1), datetime(2017, 9, 20), datetime(2017, 10, 1) ] s1 = Series(np.random.rand(5), index=date_list) s1[1] # 位置信息访问 s1[datetime(2016, 9, 10)] s1['2016-9-10'] s1['20160910'] s1['2016-09'] # 2016年9月的数据 s1['2016'] # 2016年的数据
date_list_new = pd.date_range(start='2016-01-01', periods=100) # freq = 'D' date_list_new = pd.date_range(start='2016-01-01', periods=100, freq = 'W-MON') # 频率周 周一开始
时间序列数据的采样和画图
t_range = pd.date_range(start='2016-01-01', end = '2016-12-31') s1 = Series(np.random.randn(len(t_range)), index = t_range) s1['2016-01'].mean()
s1_month = s1.resample('M').mean() #按月份采样 取平均值 s1.resample('H').ffill() # forward填充 s1.resample('H').bfill() # backward填充
t_range = pd.date_range('2016-01-01', '2016-12-31', freq='H') stock_df = DataFrame(index = t_range) stock_df['BABA'] = np.random.randint(80, 160, size=len(t_range)) stock_df['TENCENT'] = np.random.randint(30, 50, size=len(t_range)) stock_df.plot() import matplotlib.pyplot as plt plt.show()
数据分箱技术Binning
score_list = np.random.randint(25, 100, size = 20) bins = [0, 59, 70, 80, 100] score_cut = pd.cut(score_list, bins) pd.value_counts(score_cut)
df = DataFrame() df['score'] = score_list df['student'] = [pd.util.testing.rands(3) for i in range(20)] df['Categories'] = pd.cut(df['score'], bins, labels = ['Low', 'OK', 'Good', 'Great'])
数据分组技术GroupBy
g = df.groupby(by = df['city']) g.groups df_bj = g.get_group('BJ') df_bj.mean() g.mean() # DataFrame
GroupBy = Split + Apply + Combine
dict(list(g))['BJ'] for name, group_df in g: print(name) print(group_df)
数据聚合技术Aggregation
def foo(attr): return attr.max() - attr.min() g.agg(foo)
g_new = df.groupby(['city', 'wind']) g_new.groups g_new.get_group(('BJ', 3)) for (name1, name2), group in g_new: pass
透视表
pd.pivot_table(df, index = ['Name']) pd.pivot_table(df, index = ['Name'], aggfunc='sum', values = ['Price']) pd.pivot_table(df, index = ['Name'], aggfunc='sum', values = ['Price'], fill_value=0)