import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
s = pd.Series([1,3,5,np.nan,6,8])
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
len(s)
6
s.describe()
count 5.000000
mean 4.600000
std 2.701851
min 1.000000
25% 3.000000
50% 5.000000
75% 6.000000
max 8.000000
dtype: float64
dates = pd.date_range('20190101',periods=6)
dates
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 |
df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | 0.159091 | -0.455118 | 0.209563 | -0.179960 |
std | 0.468160 | 1.014914 | 0.786212 | 1.293573 |
min | -0.468258 | -1.682376 | -0.781625 | -1.378140 |
25% | -0.140934 | -0.924528 | -0.229347 | -1.274607 |
50% | 0.267372 | -0.774334 | 0.136517 | -0.514590 |
75% | 0.308650 | 0.055257 | 0.501539 | 1.000913 |
max | 0.842686 | 1.143140 | 1.494058 | 1.349425 |
df.shape
(6, 4)
# Build a DataFrame from a dict of heterogeneous columns: the length-4
# Series/array/Categorical fix the index at 4 rows, and the scalar
# entries ('A', 'B', 'F') are broadcast to every row.
df2 = pd.DataFrame(
    {
        'A': 1.0,
        'B': pd.Timestamp('20190101'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | E | F |
0 | 1.0 | 2019-01-01 | 1.0 | 3 | test | foo |
1 | 1.0 | 2019-01-01 | 1.0 | 3 | train | foo |
2 | 1.0 | 2019-01-01 | 1.0 | 3 | test | foo |
3 | 1.0 | 2019-01-01 | 1.0 | 3 | train | foo |
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
df.head()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 |
df.tail()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 |
df.index
DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
'2019-01-05', '2019-01-06'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
df.values
array([[ 0.31757454, 0.33054893, 0.05537508, -1.28445319],
[-0.27220143, -0.77061807, 0.21765843, 1.34942538],
[ 0.84268621, -0.97335385, 0.59616646, 0.21588867],
[ 0.25286828, -0.77804969, -0.32425479, -1.37813964],
[ 0.28187609, 1.14314031, -0.78162546, -1.24506887],
[-0.4682577 , -1.68237556, 1.49405812, 1.26258772]])
df.describe()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
mean | 0.159091 | -0.455118 | 0.209563 | -0.179960 |
std | 0.468160 | 1.014914 | 0.786212 | 1.293573 |
min | -0.468258 | -1.682376 | -0.781625 | -1.378140 |
25% | -0.140934 | -0.924528 | -0.229347 | -1.274607 |
50% | 0.267372 | -0.774334 | 0.136517 | -0.514590 |
75% | 0.308650 | 0.055257 | 0.501539 | 1.000913 |
max | 0.842686 | 1.143140 | 1.494058 | 1.349425 |
df2.describe(include='all')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | E | F |
count | 4.0 | 4 | 4.0 | 4.0 | 4 | 4 |
unique | NaN | 1 | NaN | NaN | 2 | 1 |
top | NaN | 2019-01-01 00:00:00 | NaN | NaN | train | foo |
freq | NaN | 4 | NaN | NaN | 2 | 4 |
first | NaN | 2019-01-01 00:00:00 | NaN | NaN | NaN | NaN |
last | NaN | 2019-01-01 00:00:00 | NaN | NaN | NaN | NaN |
mean | 1.0 | NaN | 1.0 | 3.0 | NaN | NaN |
std | 0.0 | NaN | 0.0 | 0.0 | NaN | NaN |
min | 1.0 | NaN | 1.0 | 3.0 | NaN | NaN |
25% | 1.0 | NaN | 1.0 | 3.0 | NaN | NaN |
50% | 1.0 | NaN | 1.0 | 3.0 | NaN | NaN |
75% | 1.0 | NaN | 1.0 | 3.0 | NaN | NaN |
max | 1.0 | NaN | 1.0 | 3.0 | NaN | NaN |
df.T
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
2019-01-01 00:00:00 | 2019-01-02 00:00:00 | 2019-01-03 00:00:00 | 2019-01-04 00:00:00 | 2019-01-05 00:00:00 | 2019-01-06 00:00:00 |
A | 0.317575 | -0.272201 | 0.842686 | 0.252868 | 0.281876 | -0.468258 |
B | 0.330549 | -0.770618 | -0.973354 | -0.778050 | 1.143140 | -1.682376 |
C | 0.055375 | 0.217658 | 0.596166 | -0.324255 | -0.781625 | 1.494058 |
D | -1.284453 | 1.349425 | 0.215889 | -1.378140 | -1.245069 | 1.262588 |
df.sort_index(axis=1,ascending=True)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 |
df.sort_values(by='B')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 |
df['A']
2019-01-01 0.317575
2019-01-02 -0.272201
2019-01-03 0.842686
2019-01-04 0.252868
2019-01-05 0.281876
2019-01-06 -0.468258
Freq: D, Name: A, dtype: float64
df[0:3]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
df['20190101':'20190103']
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
df.loc[dates[0]]
A 0.317575
B 0.330549
C 0.055375
D -1.284453
Name: 2019-01-01 00:00:00, dtype: float64
dates[0]
Timestamp('2019-01-01 00:00:00', freq='D')
df.loc[:,['A','B']]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B |
2019-01-01 | 0.317575 | 0.330549 |
2019-01-02 | -0.272201 | -0.770618 |
2019-01-03 | 0.842686 | -0.973354 |
2019-01-04 | 0.252868 | -0.778050 |
2019-01-05 | 0.281876 | 1.143140 |
2019-01-06 | -0.468258 | -1.682376 |
df.loc['20190104',['A','B']].shape
(2,)
df.loc[dates[0],'A']
0.3175745430028141
df.at[dates[0],'A']
0.3175745430028141
df.at[dates[0],'A']
df.iat[1,2]
0.21765843113729494
df.iloc[3]
A 0.252868
B -0.778050
C -0.324255
D -1.378140
Name: 2019-01-04 00:00:00, dtype: float64
df.iloc[3,1]
-0.7780496857268889
df.iloc[[1,2,4],[0,2]]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | C |
2019-01-02 | -0.272201 | 0.217658 |
2019-01-03 | 0.842686 | 0.596166 |
2019-01-05 | 0.281876 | -0.781625 |
df.iloc[1:3,:]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
df.iloc[1:3]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
df[df.A>0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 |
df[df>0]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | NaN |
2019-01-02 | NaN | NaN | 0.217658 | 1.349425 |
2019-01-03 | 0.842686 | NaN | 0.596166 | 0.215889 |
2019-01-04 | 0.252868 | NaN | NaN | NaN |
2019-01-05 | 0.281876 | 1.143140 | NaN | NaN |
2019-01-06 | NaN | NaN | 1.494058 | 1.262588 |
df3 = df.copy()
df3['E']=['one','two','three','four','three','one']
df3
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | E |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 | one |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 | two |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 | three |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 | four |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 | three |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 | one |
df3[df3['E'].isin(['one','two'])]
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | E |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 | one |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 | two |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 | one |
s1= pd.Series([1,2,3,4,5,6],index=pd.date_range('20190102',periods=6))
s1
2019-01-02 1
2019-01-03 2
2019-01-04 3
2019-01-05 4
2019-01-06 5
2019-01-07 6
Freq: D, dtype: int64
df['F']=s1
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F |
2019-01-01 | 0.317575 | 0.330549 | 0.055375 | -1.284453 | NaN |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 1.349425 | 1.0 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 0.215889 | 2.0 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | -1.378140 | 3.0 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | -1.245069 | 4.0 |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 1.262588 | 5.0 |
df.at[dates[0],'A']=0
df.iat[0,1]=0
df.loc[:,'D']=np.array([5]*len(df))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F |
2019-01-01 | 0.000000 | 0.000000 | 0.055375 | 5 | NaN |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 5 | 1.0 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 5 | 2.0 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | 5 | 3.0 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | 5 | 4.0 |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 5 | 5.0 |
df4 = df.copy()
df4[df4>0]=-df4
df4
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F |
2019-01-01 | 0.000000 | 0.000000 | -0.055375 | -5 | NaN |
2019-01-02 | -0.272201 | -0.770618 | -0.217658 | -5 | -1.0 |
2019-01-03 | -0.842686 | -0.973354 | -0.596166 | -5 | -2.0 |
2019-01-04 | -0.252868 | -0.778050 | -0.324255 | -5 | -3.0 |
2019-01-05 | -0.281876 | -1.143140 | -0.781625 | -5 | -4.0 |
2019-01-06 | -0.468258 | -1.682376 | -1.494058 | -5 | -5.0 |
df1 = df.reindex(index=dates[0:4],columns=list(df.columns)+['E'])
df1.loc[dates[0]:dates[1],'E']=1
df1
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F | E |
2019-01-01 | 0.000000 | 0.000000 | 0.055375 | 5 | NaN | 1.0 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 5 | 1.0 | 1.0 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 5 | 2.0 | NaN |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | 5 | 3.0 | NaN |
df1.dropna(how='any')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F | E |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 5 | 1.0 | 1.0 |
df1.fillna(value=5)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F | E |
2019-01-01 | 0.000000 | 0.000000 | 0.055375 | 5 | 5.0 | 1.0 |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 5 | 1.0 | 1.0 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 5 | 2.0 | 5.0 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | 5 | 3.0 | 5.0 |
pd.isna(df1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F | E |
2019-01-01 | False | False | False | False | True | False |
2019-01-02 | False | False | False | False | False | False |
2019-01-03 | False | False | False | False | False | True |
2019-01-04 | False | False | False | False | False | True |
df.mean()
A 0.106162
B -0.510209
C 0.209563
D 5.000000
F 3.000000
dtype: float64
df.mean(1)
2019-01-01 1.263844
2019-01-02 1.034968
2019-01-03 1.493100
2019-01-04 1.430113
2019-01-05 1.928678
2019-01-06 1.868685
Freq: D, dtype: float64
s = pd.Series([1,3,5,np.nan,6,8],index=dates).shift(2)
s
2019-01-01 NaN
2019-01-02 NaN
2019-01-03 1.0
2019-01-04 3.0
2019-01-05 5.0
2019-01-06 NaN
Freq: D, dtype: float64
df.sub(s,axis='index')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F |
2019-01-01 | NaN | NaN | NaN | NaN | NaN |
2019-01-02 | NaN | NaN | NaN | NaN | NaN |
2019-01-03 | -0.157314 | -1.973354 | -0.403834 | 4.0 | 1.0 |
2019-01-04 | -2.747132 | -3.778050 | -3.324255 | 2.0 | 0.0 |
2019-01-05 | -4.718124 | -3.856860 | -5.781625 | 0.0 | -1.0 |
2019-01-06 | NaN | NaN | NaN | NaN | NaN |
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F |
2019-01-01 | 0.000000 | 0.000000 | 0.055375 | 5 | NaN |
2019-01-02 | -0.272201 | -0.770618 | 0.217658 | 5 | 1.0 |
2019-01-03 | 0.842686 | -0.973354 | 0.596166 | 5 | 2.0 |
2019-01-04 | 0.252868 | -0.778050 | -0.324255 | 5 | 3.0 |
2019-01-05 | 0.281876 | 1.143140 | -0.781625 | 5 | 4.0 |
2019-01-06 | -0.468258 | -1.682376 | 1.494058 | 5 | 5.0 |
df.apply(np.cumsum)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | F |
2019-01-01 | 0.000000 | 0.000000 | 0.055375 | 5 | NaN |
2019-01-02 | -0.272201 | -0.770618 | 0.273034 | 10 | 1.0 |
2019-01-03 | 0.570485 | -1.743972 | 0.869200 | 15 | 3.0 |
2019-01-04 | 0.823353 | -2.522022 | 0.544945 | 20 | 6.0 |
2019-01-05 | 1.105229 | -1.378881 | -0.236680 | 25 | 10.0 |
2019-01-06 | 0.636971 | -3.061257 | 1.257378 | 30 | 15.0 |
df.apply(lambda x:x.max()-x.min())
A 1.310944
B 2.825516
C 2.275684
D 0.000000
F 4.000000
dtype: float64
df['F'].value_counts()
5.0 1
4.0 1
3.0 1
2.0 1
1.0 1
Name: F, dtype: int64
s = pd.Series(np.random.randint(0,7,size=10))
s
0 1
1 1
2 1
3 2
4 3
5 2
6 6
7 0
8 4
9 6
dtype: int32
s.dtype
dtype('int32')
s.shape
(10,)
s.value_counts()
1 3
6 2
2 2
4 1
3 1
0 1
dtype: int64
s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])
s.dtype
dtype('O')
s.str
<pandas.core.strings.StringMethods at 0x22f0313d160>
s.str.lower()
0 a
1 b
2 c
3 aaba
4 baca
5 NaN
6 caba
7 dog
8 cat
dtype: object
df = pd.DataFrame(np.random.randn(10,4))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | 1 | 2 | 3 |
0 | -0.936960 | -0.210650 | 1.887069 | 0.128005 |
1 | 0.657660 | 0.254801 | -0.092190 | -1.957160 |
2 | -0.920141 | -2.259964 | 0.083965 | 0.371865 |
3 | -0.755403 | 0.633426 | 0.090949 | -0.626964 |
4 | 0.170052 | 1.164537 | -1.193706 | 1.391785 |
5 | 0.084560 | -1.513814 | 0.069032 | 0.099851 |
6 | -0.683663 | 0.031873 | -0.062998 | 0.523253 |
7 | -0.926594 | 0.125286 | -1.894089 | -0.449402 |
8 | 0.610722 | 0.329156 | 0.025149 | -2.673445 |
9 | 0.336673 | 1.205792 | -1.346179 | 0.214389 |
pieces = [df[:3],df[3:7],df[7:]]
pieces
[ 0 1 2 3
0 -0.936960 -0.210650 1.887069 0.128005
1 0.657660 0.254801 -0.092190 -1.957160
2 -0.920141 -2.259964 0.083965 0.371865,
0 1 2 3
3 -0.755403 0.633426 0.090949 -0.626964
4 0.170052 1.164537 -1.193706 1.391785
5 0.084560 -1.513814 0.069032 0.099851
6 -0.683663 0.031873 -0.062998 0.523253,
0 1 2 3
7 -0.926594 0.125286 -1.894089 -0.449402
8 0.610722 0.329156 0.025149 -2.673445
9 0.336673 1.205792 -1.346179 0.214389]
pd.concat(pieces)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | 1 | 2 | 3 |
0 | -0.936960 | -0.210650 | 1.887069 | 0.128005 |
1 | 0.657660 | 0.254801 | -0.092190 | -1.957160 |
2 | -0.920141 | -2.259964 | 0.083965 | 0.371865 |
3 | -0.755403 | 0.633426 | 0.090949 | -0.626964 |
4 | 0.170052 | 1.164537 | -1.193706 | 1.391785 |
5 | 0.084560 | -1.513814 | 0.069032 | 0.099851 |
6 | -0.683663 | 0.031873 | -0.062998 | 0.523253 |
7 | -0.926594 | 0.125286 | -1.894089 | -0.449402 |
8 | 0.610722 | 0.329156 | 0.025149 | -2.673445 |
9 | 0.336673 | 1.205792 | -1.346179 | 0.214389 |
concat
- axis=1 沿列方向(横向)合并;axis=0 沿行方向(纵向)合并(默认)
- join='inner' 只保留各对象共有的标签;join='outer' 保留合并后的全部标签(默认)
- ignore_index=True 丢弃原索引并重新生成连续索引;ignore_index=False 保留原有索引值(默认)
- verify_integrity=True 检查合并结果是否出现重复索引,若有则抛出异常
- keys 为各个被合并对象加上一层外层键,使结果索引成为多级索引(MultiIndex)
- names 用于给生成的多级索引的各层级命名。
pd.concat(pieces,axis=1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 |
0 | -0.936960 | -0.210650 | 1.887069 | 0.128005 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0.657660 | 0.254801 | -0.092190 | -1.957160 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | -0.920141 | -2.259964 | 0.083965 | 0.371865 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | -0.755403 | 0.633426 | 0.090949 | -0.626964 | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | 0.170052 | 1.164537 | -1.193706 | 1.391785 | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | 0.084560 | -1.513814 | 0.069032 | 0.099851 | NaN | NaN | NaN | NaN |
6 | NaN | NaN | NaN | NaN | -0.683663 | 0.031873 | -0.062998 | 0.523253 | NaN | NaN | NaN | NaN |
7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.926594 | 0.125286 | -1.894089 | -0.449402 |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.610722 | 0.329156 | 0.025149 | -2.673445 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.336673 | 1.205792 | -1.346179 | 0.214389 |
pd.concat(pieces,join='outer',axis=1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 | 0 | 1 | 2 | 3 |
0 | -0.936960 | -0.210650 | 1.887069 | 0.128005 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 0.657660 | 0.254801 | -0.092190 | -1.957160 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | -0.920141 | -2.259964 | 0.083965 | 0.371865 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | -0.755403 | 0.633426 | 0.090949 | -0.626964 | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | 0.170052 | 1.164537 | -1.193706 | 1.391785 | NaN | NaN | NaN | NaN |
5 | NaN | NaN | NaN | NaN | 0.084560 | -1.513814 | 0.069032 | 0.099851 | NaN | NaN | NaN | NaN |
6 | NaN | NaN | NaN | NaN | -0.683663 | 0.031873 | -0.062998 | 0.523253 | NaN | NaN | NaN | NaN |
7 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | -0.926594 | 0.125286 | -1.894089 | -0.449402 |
8 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.610722 | 0.329156 | 0.025149 | -2.673445 |
9 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.336673 | 1.205792 | -1.346179 | 0.214389 |
left = pd.DataFrame({'key':['foo','bar'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','bar'],'rval':[4,5]})
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pd.merge(left,right,on='key')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key | lval | rval |
0 | foo | 1 | 4 |
1 | bar | 2 | 5 |
left = pd.DataFrame({'key':['foo','foo'],'lval':[1,2]})
right = pd.DataFrame({'key':['foo','foo'],'rval':[4,5]})
left
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
right
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
pd.merge(left,right,on='key')
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
key | lval | rval |
0 | foo | 1 | 4 |
1 | foo | 1 | 5 |
2 | foo | 2 | 4 |
3 | foo | 2 | 5 |
df = pd.DataFrame(np.random.randn(8,4),columns=list('ABCD'))
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
0 | 0.862756 | -1.436692 | 0.367396 | 1.033803 |
1 | -0.732357 | -0.350199 | -0.116083 | -2.435210 |
2 | 0.316582 | 0.468616 | 0.433001 | -0.443120 |
3 | -0.189939 | -2.437137 | 0.126893 | -2.273711 |
4 | 0.913514 | -0.752727 | -1.651140 | 1.156839 |
5 | -0.314581 | 1.296585 | 0.579130 | -0.871556 |
6 | 0.361473 | 0.687854 | -1.044602 | 0.233138 |
7 | 0.045199 | 2.176608 | -0.258569 | -1.018576 |
s = df.iloc[3]
df.append(s,ignore_index=False)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
0 | 0.862756 | -1.436692 | 0.367396 | 1.033803 |
1 | -0.732357 | -0.350199 | -0.116083 | -2.435210 |
2 | 0.316582 | 0.468616 | 0.433001 | -0.443120 |
3 | -0.189939 | -2.437137 | 0.126893 | -2.273711 |
4 | 0.913514 | -0.752727 | -1.651140 | 1.156839 |
5 | -0.314581 | 1.296585 | 0.579130 | -0.871556 |
6 | 0.361473 | 0.687854 | -1.044602 | 0.233138 |
7 | 0.045199 | 2.176608 | -0.258569 | -1.018576 |
3 | -0.189939 | -2.437137 | 0.126893 | -2.273711 |
“group by(分组)”指的是包含以下一个或多个步骤的过程:
拆分:根据某些标准将数据分成若干组
应用:对每个组独立地应用一个函数
合并:将各组结果组合成一个数据结构
# 8-row demo frame for groupby: two categorical key columns ('A', 'B')
# and two random float value columns ('C', 'D').
# Fix: the pasted code carried IPython continuation prompts ("....: ")
# inside the literal, which made it syntactically invalid.
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                         'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three',
                         'two', 'two', 'one', 'three'],
                   'C': np.random.randn(8),
                   'D': np.random.randn(8)})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
0 | foo | one | 0.982988 | -0.386029 |
1 | bar | one | 1.632482 | -0.327520 |
2 | foo | two | 0.149619 | -0.138297 |
3 | bar | three | -1.480397 | 1.105690 |
4 | foo | two | 0.647044 | -1.097276 |
5 | bar | two | -0.675596 | 0.250176 |
6 | foo | one | 0.437309 | 1.031742 |
7 | foo | three | 0.434659 | 1.197695 |
df.groupby('A').sum()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
C | D |
A |
bar | -0.523512 | 1.028346 |
foo | 2.651618 | 0.607834 |
df.groupby('A').max()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
B | C | D |
A |
bar | two | 1.632482 | 1.105690 |
foo | two | 0.982988 | 1.197695 |
df.groupby(['A','B']).sum()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
C | D |
A | B |
bar | one | 1.632482 | -0.327520 |
three | -1.480397 | 1.105690 |
two | -0.675596 | 0.250176 |
foo | one | 1.420297 | 0.645713 |
three | 0.434659 | 1.197695 |
two | 0.796663 | -1.235574 |
# Pair up the two parallel label lists into (first, second) tuples for a
# MultiIndex: zip(*[...]) transposes the list-of-lists.
# Fix: the pasted code carried IPython continuation prompts ("....: ")
# inside the literal, which made it syntactically invalid.
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz',
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                     'one', 'two', 'one', 'two']]))
tuples
[('bar', 'one'),
('bar', 'two'),
('baz', 'one'),
('baz', 'two'),
('foo', 'one'),
('foo', 'two'),
('qux', 'one'),
('qux', 'two')]
zip() 函数接收若干可迭代对象作为参数,将各对象中对应位置的元素打包成一个个元组;在 Python 3 中它返回的是惰性迭代器(按需生成、节省内存),如需列表可用 list() 将其转换。
index = pd.MultiIndex.from_tuples(tuples,names=['first','second'])
index
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
labels=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
df = pd.DataFrame(np.random.randn(8,2),index=index,columns=['A','B'])
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B |
first | second |
bar | one | -0.752364 | 0.982241 |
two | -0.626690 | 1.358099 |
baz | one | 0.342360 | -0.618870 |
two | -0.399448 | -0.500175 |
foo | one | -0.746328 | -0.244752 |
two | 1.556458 | -1.340098 |
qux | one | -0.569393 | -0.340625 |
two | 0.513714 | 0.151477 |
stacked = df.stack()
stacked
first second
bar one A -0.752364
B 0.982241
two A -0.626690
B 1.358099
baz one A 0.342360
B -0.618870
two A -0.399448
B -0.500175
foo one A -0.746328
B -0.244752
two A 1.556458
B -1.340098
qux one A -0.569393
B -0.340625
two A 0.513714
B 0.151477
dtype: float64
stacked.unstack()
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B |
first | second |
bar | one | -0.752364 | 0.982241 |
two | -0.626690 | 1.358099 |
baz | one | 0.342360 | -0.618870 |
two | -0.399448 | -0.500175 |
foo | one | -0.746328 | -0.244752 |
two | 1.556458 | -1.340098 |
qux | one | -0.569393 | -0.340625 |
two | 0.513714 | 0.151477 |
stacked.unstack(1)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
second | one | two |
first |
bar | A | -0.752364 | -0.626690 |
B | 0.982241 | 1.358099 |
baz | A | 0.342360 | -0.399448 |
B | -0.618870 | -0.500175 |
foo | A | -0.746328 | 1.556458 |
B | -0.244752 | -1.340098 |
qux | A | -0.569393 | 0.513714 |
B | -0.340625 | 0.151477 |
stacked.unstack(0)
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
first | bar | baz | foo | qux |
second |
one | A | -0.752364 | 0.342360 | -0.746328 | -0.569393 |
B | 0.982241 | -0.618870 | -0.244752 | -0.340625 |
two | A | -0.626690 | -0.399448 | 1.556458 | 0.513714 |
B | 1.358099 | -0.500175 | -1.340098 | 0.151477 |
# 12-row demo frame for pivot_table: three categorical key columns
# ('A', 'B', 'C') built by cycling short lists, and two random float
# value columns ('D', 'E').
# Fix: the pasted code carried IPython continuation prompts (".....: ")
# inside the literal, which made it syntactically invalid.
df = pd.DataFrame({'A': ['one', 'one', 'two', 'three'] * 3,
                   'B': ['A', 'B', 'C'] * 4,
                   'C': ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D': np.random.randn(12),
                   'E': np.random.randn(12)})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D | E |
0 | one | A | foo | 0.346892 | -0.615216 |
1 | one | B | foo | 0.808681 | 0.161598 |
2 | two | C | foo | -1.324783 | -0.088082 |
3 | three | A | bar | -0.227795 | -0.180022 |
4 | one | B | bar | 1.130028 | -0.361439 |
5 | one | C | bar | 0.510629 | -1.466063 |
6 | two | A | foo | 0.379503 | 0.008279 |
7 | three | B | foo | 0.921087 | -0.148614 |
8 | one | C | foo | 0.332222 | -0.127428 |
9 | one | A | bar | -0.784876 | -0.736117 |
10 | two | B | bar | -0.793957 | 1.705022 |
11 | three | C | bar | -0.898485 | 1.038166 |
pd.pivot_table(df,values='D',index=['A','B'],columns=['C'])
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
C | bar | foo |
A | B |
one | A | -0.784876 | 0.346892 |
B | 1.130028 | 0.808681 |
C | 0.510629 | 0.332222 |
three | A | -0.227795 | NaN |
B | NaN | 0.921087 |
C | -0.898485 | NaN |
two | A | NaN | 0.379503 |
B | -0.793957 | NaN |
C | NaN | -1.324783 |
rng = pd.date_range('1/1/2012', periods=100, freq='S')
rng
DatetimeIndex(['2012-01-01 00:00:00', '2012-01-01 00:00:01',
'2012-01-01 00:00:02', '2012-01-01 00:00:03',
'2012-01-01 00:00:04', '2012-01-01 00:00:05',
'2012-01-01 00:00:06', '2012-01-01 00:00:07',
'2012-01-01 00:00:08', '2012-01-01 00:00:09',
'2012-01-01 00:00:10', '2012-01-01 00:00:11',
'2012-01-01 00:00:12', '2012-01-01 00:00:13',
'2012-01-01 00:00:14', '2012-01-01 00:00:15',
'2012-01-01 00:00:16', '2012-01-01 00:00:17',
'2012-01-01 00:00:18', '2012-01-01 00:00:19',
'2012-01-01 00:00:20', '2012-01-01 00:00:21',
'2012-01-01 00:00:22', '2012-01-01 00:00:23',
'2012-01-01 00:00:24', '2012-01-01 00:00:25',
'2012-01-01 00:00:26', '2012-01-01 00:00:27',
'2012-01-01 00:00:28', '2012-01-01 00:00:29',
'2012-01-01 00:00:30', '2012-01-01 00:00:31',
'2012-01-01 00:00:32', '2012-01-01 00:00:33',
'2012-01-01 00:00:34', '2012-01-01 00:00:35',
'2012-01-01 00:00:36', '2012-01-01 00:00:37',
'2012-01-01 00:00:38', '2012-01-01 00:00:39',
'2012-01-01 00:00:40', '2012-01-01 00:00:41',
'2012-01-01 00:00:42', '2012-01-01 00:00:43',
'2012-01-01 00:00:44', '2012-01-01 00:00:45',
'2012-01-01 00:00:46', '2012-01-01 00:00:47',
'2012-01-01 00:00:48', '2012-01-01 00:00:49',
'2012-01-01 00:00:50', '2012-01-01 00:00:51',
'2012-01-01 00:00:52', '2012-01-01 00:00:53',
'2012-01-01 00:00:54', '2012-01-01 00:00:55',
'2012-01-01 00:00:56', '2012-01-01 00:00:57',
'2012-01-01 00:00:58', '2012-01-01 00:00:59',
'2012-01-01 00:01:00', '2012-01-01 00:01:01',
'2012-01-01 00:01:02', '2012-01-01 00:01:03',
'2012-01-01 00:01:04', '2012-01-01 00:01:05',
'2012-01-01 00:01:06', '2012-01-01 00:01:07',
'2012-01-01 00:01:08', '2012-01-01 00:01:09',
'2012-01-01 00:01:10', '2012-01-01 00:01:11',
'2012-01-01 00:01:12', '2012-01-01 00:01:13',
'2012-01-01 00:01:14', '2012-01-01 00:01:15',
'2012-01-01 00:01:16', '2012-01-01 00:01:17',
'2012-01-01 00:01:18', '2012-01-01 00:01:19',
'2012-01-01 00:01:20', '2012-01-01 00:01:21',
'2012-01-01 00:01:22', '2012-01-01 00:01:23',
'2012-01-01 00:01:24', '2012-01-01 00:01:25',
'2012-01-01 00:01:26', '2012-01-01 00:01:27',
'2012-01-01 00:01:28', '2012-01-01 00:01:29',
'2012-01-01 00:01:30', '2012-01-01 00:01:31',
'2012-01-01 00:01:32', '2012-01-01 00:01:33',
'2012-01-01 00:01:34', '2012-01-01 00:01:35',
'2012-01-01 00:01:36', '2012-01-01 00:01:37',
'2012-01-01 00:01:38', '2012-01-01 00:01:39'],
dtype='datetime64[ns]', freq='S')
ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)
ts
2012-01-01 00:00:00 225
2012-01-01 00:00:01 354
2012-01-01 00:00:02 438
2012-01-01 00:00:03 440
2012-01-01 00:00:04 9
2012-01-01 00:00:05 179
2012-01-01 00:00:06 396
2012-01-01 00:00:07 200
2012-01-01 00:00:08 413
2012-01-01 00:00:09 490
2012-01-01 00:00:10 37
2012-01-01 00:00:11 57
2012-01-01 00:00:12 33
2012-01-01 00:00:13 388
2012-01-01 00:00:14 44
2012-01-01 00:00:15 95
2012-01-01 00:00:16 8
2012-01-01 00:00:17 1
2012-01-01 00:00:18 307
2012-01-01 00:00:19 332
2012-01-01 00:00:20 20
2012-01-01 00:00:21 84
2012-01-01 00:00:22 309
2012-01-01 00:00:23 308
2012-01-01 00:00:24 67
2012-01-01 00:00:25 245
2012-01-01 00:00:26 180
2012-01-01 00:00:27 9
2012-01-01 00:00:28 126
2012-01-01 00:00:29 232
...
2012-01-01 00:01:10 409
2012-01-01 00:01:11 355
2012-01-01 00:01:12 70
2012-01-01 00:01:13 266
2012-01-01 00:01:14 118
2012-01-01 00:01:15 325
2012-01-01 00:01:16 214
2012-01-01 00:01:17 3
2012-01-01 00:01:18 143
2012-01-01 00:01:19 28
2012-01-01 00:01:20 56
2012-01-01 00:01:21 120
2012-01-01 00:01:22 99
2012-01-01 00:01:23 102
2012-01-01 00:01:24 71
2012-01-01 00:01:25 464
2012-01-01 00:01:26 489
2012-01-01 00:01:27 404
2012-01-01 00:01:28 356
2012-01-01 00:01:29 197
2012-01-01 00:01:30 390
2012-01-01 00:01:31 345
2012-01-01 00:01:32 115
2012-01-01 00:01:33 377
2012-01-01 00:01:34 388
2012-01-01 00:01:35 39
2012-01-01 00:01:36 406
2012-01-01 00:01:37 408
2012-01-01 00:01:38 410
2012-01-01 00:01:39 256
Freq: S, Length: 100, dtype: int32
rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')
rng
DatetimeIndex(['2012-03-06', '2012-03-07', '2012-03-08', '2012-03-09',
'2012-03-10'],
dtype='datetime64[ns]', freq='D')
ts_utc = ts.tz_localize('UTC')
ts_utc
2012-01-01 00:00:00+00:00 225
2012-01-01 00:00:01+00:00 354
2012-01-01 00:00:02+00:00 438
2012-01-01 00:00:03+00:00 440
2012-01-01 00:00:04+00:00 9
2012-01-01 00:00:05+00:00 179
2012-01-01 00:00:06+00:00 396
2012-01-01 00:00:07+00:00 200
2012-01-01 00:00:08+00:00 413
2012-01-01 00:00:09+00:00 490
2012-01-01 00:00:10+00:00 37
2012-01-01 00:00:11+00:00 57
2012-01-01 00:00:12+00:00 33
2012-01-01 00:00:13+00:00 388
2012-01-01 00:00:14+00:00 44
2012-01-01 00:00:15+00:00 95
2012-01-01 00:00:16+00:00 8
2012-01-01 00:00:17+00:00 1
2012-01-01 00:00:18+00:00 307
2012-01-01 00:00:19+00:00 332
2012-01-01 00:00:20+00:00 20
2012-01-01 00:00:21+00:00 84
2012-01-01 00:00:22+00:00 309
2012-01-01 00:00:23+00:00 308
2012-01-01 00:00:24+00:00 67
2012-01-01 00:00:25+00:00 245
2012-01-01 00:00:26+00:00 180
2012-01-01 00:00:27+00:00 9
2012-01-01 00:00:28+00:00 126
2012-01-01 00:00:29+00:00 232
...
2012-01-01 00:01:10+00:00 409
2012-01-01 00:01:11+00:00 355
2012-01-01 00:01:12+00:00 70
2012-01-01 00:01:13+00:00 266
2012-01-01 00:01:14+00:00 118
2012-01-01 00:01:15+00:00 325
2012-01-01 00:01:16+00:00 214
2012-01-01 00:01:17+00:00 3
2012-01-01 00:01:18+00:00 143
2012-01-01 00:01:19+00:00 28
2012-01-01 00:01:20+00:00 56
2012-01-01 00:01:21+00:00 120
2012-01-01 00:01:22+00:00 99
2012-01-01 00:01:23+00:00 102
2012-01-01 00:01:24+00:00 71
2012-01-01 00:01:25+00:00 464
2012-01-01 00:01:26+00:00 489
2012-01-01 00:01:27+00:00 404
2012-01-01 00:01:28+00:00 356
2012-01-01 00:01:29+00:00 197
2012-01-01 00:01:30+00:00 390
2012-01-01 00:01:31+00:00 345
2012-01-01 00:01:32+00:00 115
2012-01-01 00:01:33+00:00 377
2012-01-01 00:01:34+00:00 388
2012-01-01 00:01:35+00:00 39
2012-01-01 00:01:36+00:00 406
2012-01-01 00:01:37+00:00 408
2012-01-01 00:01:38+00:00 410
2012-01-01 00:01:39+00:00 256
Freq: S, Length: 100, dtype: int32
ts_utc.tz_convert('US/Eastern')
2011-12-31 19:00:00-05:00 225
2011-12-31 19:00:01-05:00 354
2011-12-31 19:00:02-05:00 438
2011-12-31 19:00:03-05:00 440
2011-12-31 19:00:04-05:00 9
2011-12-31 19:00:05-05:00 179
2011-12-31 19:00:06-05:00 396
2011-12-31 19:00:07-05:00 200
2011-12-31 19:00:08-05:00 413
2011-12-31 19:00:09-05:00 490
2011-12-31 19:00:10-05:00 37
2011-12-31 19:00:11-05:00 57
2011-12-31 19:00:12-05:00 33
2011-12-31 19:00:13-05:00 388
2011-12-31 19:00:14-05:00 44
2011-12-31 19:00:15-05:00 95
2011-12-31 19:00:16-05:00 8
2011-12-31 19:00:17-05:00 1
2011-12-31 19:00:18-05:00 307
2011-12-31 19:00:19-05:00 332
2011-12-31 19:00:20-05:00 20
2011-12-31 19:00:21-05:00 84
2011-12-31 19:00:22-05:00 309
2011-12-31 19:00:23-05:00 308
2011-12-31 19:00:24-05:00 67
2011-12-31 19:00:25-05:00 245
2011-12-31 19:00:26-05:00 180
2011-12-31 19:00:27-05:00 9
2011-12-31 19:00:28-05:00 126
2011-12-31 19:00:29-05:00 232
...
2011-12-31 19:01:10-05:00 409
2011-12-31 19:01:11-05:00 355
2011-12-31 19:01:12-05:00 70
2011-12-31 19:01:13-05:00 266
2011-12-31 19:01:14-05:00 118
2011-12-31 19:01:15-05:00 325
2011-12-31 19:01:16-05:00 214
2011-12-31 19:01:17-05:00 3
2011-12-31 19:01:18-05:00 143
2011-12-31 19:01:19-05:00 28
2011-12-31 19:01:20-05:00 56
2011-12-31 19:01:21-05:00 120
2011-12-31 19:01:22-05:00 99
2011-12-31 19:01:23-05:00 102
2011-12-31 19:01:24-05:00 71
2011-12-31 19:01:25-05:00 464
2011-12-31 19:01:26-05:00 489
2011-12-31 19:01:27-05:00 404
2011-12-31 19:01:28-05:00 356
2011-12-31 19:01:29-05:00 197
2011-12-31 19:01:30-05:00 390
2011-12-31 19:01:31-05:00 345
2011-12-31 19:01:32-05:00 115
2011-12-31 19:01:33-05:00 377
2011-12-31 19:01:34-05:00 388
2011-12-31 19:01:35-05:00 39
2011-12-31 19:01:36-05:00 406
2011-12-31 19:01:37-05:00 408
2011-12-31 19:01:38-05:00 410
2011-12-31 19:01:39-05:00 256
Freq: S, Length: 100, dtype: int32
prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')
prng
PeriodIndex(['1990Q1', '1990Q2', '1990Q3', '1990Q4', '1991Q1', '1991Q2',
'1991Q3', '1991Q4', '1992Q1', '1992Q2', '1992Q3', '1992Q4',
'1993Q1', '1993Q2', '1993Q3', '1993Q4', '1994Q1', '1994Q2',
'1994Q3', '1994Q4', '1995Q1', '1995Q2', '1995Q3', '1995Q4',
'1996Q1', '1996Q2', '1996Q3', '1996Q4', '1997Q1', '1997Q2',
'1997Q3', '1997Q4', '1998Q1', '1998Q2', '1998Q3', '1998Q4',
'1999Q1', '1999Q2', '1999Q3', '1999Q4', '2000Q1', '2000Q2',
'2000Q3', '2000Q4'],
dtype='period[Q-NOV]', freq='Q-NOV')
ts = pd.Series(np.random.randn(len(prng)), prng)
ts
1990Q1 1.796304
1990Q2 0.659808
1990Q3 -0.647755
1990Q4 1.846486
1991Q1 0.488348
1991Q2 1.830351
1991Q3 -1.658804
1991Q4 0.585780
1992Q1 -0.596026
1992Q2 -1.900346
1992Q3 -0.066638
1992Q4 0.419037
1993Q1 0.055711
1993Q2 -2.103900
1993Q3 0.229944
1993Q4 0.317348
1994Q1 -0.776638
1994Q2 -0.241438
1994Q3 -0.587104
1994Q4 0.825772
1995Q1 2.444721
1995Q2 0.803142
1995Q3 0.494378
1995Q4 -0.984900
1996Q1 -0.431641
1996Q2 0.766768
1996Q3 -1.176313
1996Q4 0.339700
1997Q1 -1.523029
1997Q2 0.512173
1997Q3 1.359914
1997Q4 0.564407
1998Q1 0.354859
1998Q2 -0.493561
1998Q3 0.514986
1998Q4 -0.156142
1999Q1 1.047135
1999Q2 0.648944
1999Q3 -1.581937
1999Q4 0.261181
2000Q1 -0.809498
2000Q2 1.102175
2000Q3 0.424905
2000Q4 -0.775245
Freq: Q-NOV, dtype: float64
# Convert the quarterly (Q-NOV) periods to hourly timestamps: take each
# quarter's ending month ('e'), step one month forward, take the first
# hour of that month ('s'), then add 9 hours — i.e. 9am on the first day
# of the month following each quarter end (see the head() output below).
ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9
ts.head()
1990-03-01 09:00 1.796304
1990-06-01 09:00 0.659808
1990-09-01 09:00 -0.647755
1990-12-01 09:00 1.846486
1991-03-01 09:00 0.488348
Freq: H, dtype: float64
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | raw_grade |
0 | 1 | a |
1 | 2 | b |
2 | 3 | b |
3 | 4 | a |
4 | 5 | a |
5 | 6 | e |
df["grade"] = df["raw_grade"].astype("category")
df["grade"]
0 a
1 b
2 b
3 a
4 a
5 e
Name: grade, dtype: category
Categories (3, object): [a, b, e]
# Relabel the three observed categories (a -> "very good", b -> "good",
# e -> "very bad"). Direct assignment to `.cat.categories` was deprecated
# and removed in pandas 2.0; `rename_categories` is the supported,
# equivalent API (same positional mapping of old to new labels).
df["grade"] = df["grade"].cat.rename_categories(["very good", "good", "very bad"])
df["grade"]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (3, object): [very good, good, very bad]
df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])
df["grade"]
0 very good
1 good
2 good
3 very good
4 very good
5 very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]
df.sort_values(by="grade")
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
id | raw_grade | grade |
5 | 6 | e | very bad |
1 | 2 | b | good |
2 | 3 | b | good |
0 | 1 | a | very good |
3 | 4 | a | very good |
4 | 5 | a | very good |
# Count rows per grade. `observed=False` keeps the unused categories
# ("bad", "medium") in the result with a count of 0, as shown in the
# output below; pandas is changing the default of `observed` for
# categorical groupers, so pin it explicitly.
df.groupby("grade", observed=False).size()
grade
very bad 1
bad 0
medium 0
good 2
very good 3
dtype: int64
ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))
ts
2000-01-01 0.585074
2000-01-02 0.605786
2000-01-03 0.697632
2000-01-04 -0.783338
2000-01-05 1.150780
2000-01-06 -0.718491
2000-01-07 0.696745
2000-01-08 0.270574
2000-01-09 0.657496
2000-01-10 -2.613661
2000-01-11 -1.978929
2000-01-12 0.325563
2000-01-13 0.286470
2000-01-14 -0.315502
2000-01-15 0.487703
2000-01-16 -1.830420
2000-01-17 0.847074
2000-01-18 -2.363392
2000-01-19 0.139429
2000-01-20 -0.512045
2000-01-21 0.209301
2000-01-22 -0.202987
2000-01-23 -0.605512
2000-01-24 0.113967
2000-01-25 -0.546799
2000-01-26 1.758145
2000-01-27 0.299658
2000-01-28 -0.614838
2000-01-29 0.262877
2000-01-30 0.021676
...
2002-08-28 0.222110
2002-08-29 -1.846013
2002-08-30 -0.094660
2002-08-31 1.281895
2002-09-01 -1.072053
2002-09-02 0.503427
2002-09-03 -0.499512
2002-09-04 -1.080912
2002-09-05 -0.780288
2002-09-06 -0.537608
2002-09-07 -0.991904
2002-09-08 0.159327
2002-09-09 0.224638
2002-09-10 2.063388
2002-09-11 1.217366
2002-09-12 0.603689
2002-09-13 0.832689
2002-09-14 -1.788089
2002-09-15 -2.183370
2002-09-16 -0.759798
2002-09-17 -0.836241
2002-09-18 0.298536
2002-09-19 1.969939
2002-09-20 -0.688728
2002-09-21 -0.964116
2002-09-22 -1.279596
2002-09-23 0.357739
2002-09-24 1.253534
2002-09-25 -0.798673
2002-09-26 -1.023241
Freq: D, Length: 1000, dtype: float64
ts.cumsum()
2000-01-01 0.585074
2000-01-02 1.190860
2000-01-03 1.888493
2000-01-04 1.105155
2000-01-05 2.255935
2000-01-06 1.537445
2000-01-07 2.234190
2000-01-08 2.504764
2000-01-09 3.162260
2000-01-10 0.548599
2000-01-11 -1.430329
2000-01-12 -1.104767
2000-01-13 -0.818296
2000-01-14 -1.133798
2000-01-15 -0.646095
2000-01-16 -2.476516
2000-01-17 -1.629442
2000-01-18 -3.992834
2000-01-19 -3.853405
2000-01-20 -4.365450
2000-01-21 -4.156149
2000-01-22 -4.359136
2000-01-23 -4.964649
2000-01-24 -4.850682
2000-01-25 -5.397481
2000-01-26 -3.639336
2000-01-27 -3.339678
2000-01-28 -3.954516
2000-01-29 -3.691639
2000-01-30 -3.669963
...
2002-08-28 -12.290664
2002-08-29 -14.136676
2002-08-30 -14.231337
2002-08-31 -12.949442
2002-09-01 -14.021495
2002-09-02 -13.518068
2002-09-03 -14.017579
2002-09-04 -15.098492
2002-09-05 -15.878779
2002-09-06 -16.416387
2002-09-07 -17.408292
2002-09-08 -17.248965
2002-09-09 -17.024327
2002-09-10 -14.960940
2002-09-11 -13.743574
2002-09-12 -13.139885
2002-09-13 -12.307196
2002-09-14 -14.095285
2002-09-15 -16.278655
2002-09-16 -17.038453
2002-09-17 -17.874694
2002-09-18 -17.576157
2002-09-19 -15.606219
2002-09-20 -16.294946
2002-09-21 -17.259062
2002-09-22 -18.538658
2002-09-23 -18.180919
2002-09-24 -16.927385
2002-09-25 -17.726058
2002-09-26 -18.749299
Freq: D, Length: 1000, dtype: float64
ts.plot()
<matplotlib.axes._subplots.AxesSubplot at 0x22f031729e8>

# Random-walk input: 1000 rows x 4 columns aligned with ts's daily index.
# The stray IPython continuation prompt (".....:") from the original
# transcript has been removed — it made this statement invalid Python.
df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])
df
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
A | B | C | D |
2000-01-01 | -0.434770 | 1.797170 | 0.354820 | -0.166193 |
2000-01-02 | 0.420233 | 0.823251 | 0.468701 | -0.582949 |
2000-01-03 | 0.674668 | 1.032230 | 1.134346 | 0.696656 |
2000-01-04 | 0.948684 | 0.188295 | -0.604249 | -0.062779 |
2000-01-05 | -1.381247 | -0.360335 | -0.346491 | 1.072618 |
2000-01-06 | 0.492510 | 1.924341 | 0.522605 | 0.293788 |
2000-01-07 | -1.129093 | 0.063874 | 0.099183 | 0.557496 |
2000-01-08 | 1.142263 | -0.309192 | 1.140049 | 1.007656 |
2000-01-09 | 1.751273 | -0.747153 | 0.795127 | -0.480155 |
2000-01-10 | -1.519661 | -1.187734 | 0.417908 | -0.675147 |
2000-01-11 | -0.096192 | 1.095308 | 0.094648 | 1.485943 |
2000-01-12 | 0.109115 | -0.213535 | -0.927250 | 1.189941 |
2000-01-13 | -0.787367 | -0.919787 | 1.286709 | 0.894471 |
2000-01-14 | -0.584850 | 0.794088 | 0.533716 | -0.159539 |
2000-01-15 | -1.352332 | -0.880446 | 0.041934 | 0.002573 |
2000-01-16 | 0.317933 | 0.957925 | 0.813780 | 0.952499 |
2000-01-17 | 0.950317 | 0.162642 | -0.018575 | -0.940598 |
2000-01-18 | -2.021125 | 1.592108 | 0.219355 | -1.300103 |
2000-01-19 | -0.673145 | -1.852674 | -0.492845 | 0.070786 |
2000-01-20 | -0.562802 | -0.504083 | 0.980132 | -0.079636 |
2000-01-21 | 0.693927 | 0.276601 | -0.502267 | 1.824789 |
2000-01-22 | 0.240543 | -0.049004 | 0.051460 | -1.093965 |
2000-01-23 | 0.159181 | 0.559377 | 0.353952 | -1.750909 |
2000-01-24 | -1.009695 | -0.169914 | 2.214441 | -1.301680 |
2000-01-25 | 0.741394 | -0.206067 | -1.250305 | -2.021061 |
2000-01-26 | -1.050527 | -0.448726 | 0.744841 | 0.559876 |
2000-01-27 | -0.268987 | 0.755171 | -0.865320 | -0.077159 |
2000-01-28 | -1.445525 | -0.443887 | 0.048399 | 0.295317 |
2000-01-29 | -0.348641 | -0.570866 | 0.446533 | -0.745215 |
2000-01-30 | -0.803883 | 0.719817 | 0.035095 | -0.057671 |
... | ... | ... | ... | ... |
2002-08-28 | -1.400857 | -1.993967 | -0.563839 | -0.553431 |
2002-08-29 | -0.860120 | -0.252746 | -0.585336 | 0.083630 |
2002-08-30 | 0.677218 | 0.113083 | -0.507485 | -1.247440 |
2002-08-31 | 1.901913 | 0.124469 | -0.482948 | 0.093981 |
2002-09-01 | 1.728861 | 1.909778 | -1.206848 | -1.324399 |
2002-09-02 | 1.419153 | -1.000495 | -0.117854 | -0.630926 |
2002-09-03 | 0.716920 | -0.831795 | 2.443522 | -0.247801 |
2002-09-04 | -0.886588 | -0.487240 | 0.476527 | 1.273604 |
2002-09-05 | -2.361533 | -0.074533 | -1.095040 | 0.087406 |
2002-09-06 | -1.225924 | -0.444836 | 0.378192 | -0.785585 |
2002-09-07 | -1.064395 | 0.046003 | 0.148525 | 0.393557 |
2002-09-08 | -0.294659 | 0.912430 | -0.795767 | 0.064672 |
2002-09-09 | 0.276846 | 0.993007 | -0.493192 | 0.673319 |
2002-09-10 | 1.676072 | 0.102106 | -1.286082 | -1.454404 |
2002-09-11 | 2.124521 | 0.069451 | 0.495054 | 0.148496 |
2002-09-12 | 0.821348 | -0.880714 | 0.933978 | 1.869043 |
2002-09-13 | -0.890738 | -1.263920 | 0.128660 | -0.282550 |
2002-09-14 | -1.097484 | 0.652124 | 0.702043 | -0.552927 |
2002-09-15 | 0.161343 | 0.157393 | 0.851718 | -1.265120 |
2002-09-16 | 0.865516 | -1.196734 | -0.985248 | -1.472387 |
2002-09-17 | -0.539248 | 1.388908 | -0.870515 | -0.671165 |
2002-09-18 | 1.154511 | 0.879535 | -0.249820 | -0.393302 |
2002-09-19 | 1.237163 | 0.668046 | 0.917817 | 0.300664 |
2002-09-20 | -0.187801 | 0.173142 | -0.225307 | 2.142230 |
2002-09-21 | 0.517452 | -0.547158 | 1.587477 | -0.922776 |
2002-09-22 | 0.424784 | 0.696831 | 1.340258 | 1.252117 |
2002-09-23 | -0.687751 | -0.006990 | -0.607220 | 0.709964 |
2002-09-24 | -1.811347 | 0.200485 | 2.117700 | -0.468944 |
2002-09-25 | -0.431668 | -0.385997 | 0.303936 | 0.817534 |
2002-09-26 | 0.678959 | 1.061957 | 1.252870 | 0.735550 |
1000 rows × 4 columns
df = df.cumsum()
# DataFrame.plot creates its own figure and axes, so the original leading
# plt.figure() only produced the stray empty "<Figure ... with 0 Axes>"
# seen in the output. Plot directly and attach the legend to the axes
# that DataFrame.plot returns.
ax = df.plot()
ax.legend(loc='best')
<matplotlib.legend.Legend at 0x22f052c62b0>
<Figure size 432x288 with 0 Axes>
