read data

import pandas as pd
import numpy as np

# Reading a CSV into pandas.
df = pd.read_csv('uk_rain_2014.csv', encoding='utf-8', header=0)

# Read n rows
data = pd.read_csv('data.csv', nrows=5)

# Getting first x rows.
df.head(5)

# Getting last x rows.
df.tail(5)

# Changing column labels.
df.columns = ['water_year','rain_octsep', 'outflow_octsep','rain_decfeb', 'outflow_decfeb', 'rain_junaug', 'outflow_junaug']
df.head(5)

# Return a NumPy representation of the DataFrame.
df.values

# Finding out how many rows the dataset has.
len(df)
df.shape

# Finding out basic statistical information on your dataset.
pd.options.display.float_format = '{:,.3f}'.format # Limit output to 3 decimal places.
df.describe()

# Constructing DataFrame from numpy ndarray:
df2 = pd.DataFrame(np.random.randint(low=0, high=10, size=(5, 5)), columns=['a', 'b', 'c', 'd', 'e'])

# Create dataframe
df = pd.DataFrame(np.random.randn(8,5))

# create an empty dataframe with named columns
col_names = ['A', 'B', 'C']
my_df = pd.DataFrame(columns=col_names)

# create from numpy
pd.DataFrame(np_array)

insert

# Append the rows of df2 (DataFrame.append is deprecated; prefer pd.concat([df, df2], ignore_index=True))
df.append(df2, ignore_index=True)

# Insert a column at position 0 (here `date` is an existing Series or list)
df.insert(0, 'date', date)

# Plain assignment adds the column at the end
df['date'] = date

# Prepend a row
df.loc[-1] = [2, 3, 4]  # add a row with temporary index -1
df.index = df.index + 1  # shift the index up by one
df = df.sort_index()  # sort by index so the new row comes first

get data from column & row (filtering)

# Getting a column by label
df['rain_octsep']

# Getting multiple columns
df[['a', 'b']]

# Getting a column by label using .
df.rain_octsep

# Creating a series of booleans based on a conditional
df.rain_octsep < 1000  # or df['rain_octsep'] < 1000

# Using a series of booleans to filter
df[df.rain_octsep < 1000]

# convert pandas float series to int and filter
df = df[df['hour'].astype(int)==0]

# Filtering on multiple conditions, selecting columns, and sorting
df[['algo_name', 'rmse']][(df['nout']==2) & (df['dataset']=='beijing_do_s') & (df['filter'].isna())].sort_values(by='rmse')

# Boolean mask marking rows where 'filter' is NaN
df['filter'].isna()

# Type conversion (datetime64 values become integer nanoseconds since the epoch)
df['timestamp'] = df['datetime'].astype('int64')

# Replace the string 'None' with NaN
df['column'].replace('None', np.nan, inplace=True)

# Filtering by multiple conditionals (the Python keyword 'and' won't work; use '&')
df[(df.rain_octsep < 1000) & (df.outflow_octsep < 4000)]

# Filtering by string methods
df[df.water_year.str.startswith('199')]

# loc gets rows (or columns) with particular labels from the index.
# iloc gets rows (or columns) at particular positions in the index (so it only takes integers).
# ix usually tried to behave like loc but fell back to iloc if a label was not present in the index (ix was removed in pandas 1.0).
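
A minimal sketch of the loc/iloc difference (the frame here is hypothetical):

demo = pd.DataFrame({'x': [10, 20, 30]}, index=[2, 0, 1])
demo.loc[0]   # row with label 0 (the second row; x == 20)
demo.iloc[0]  # row at position 0 (the row labelled 2; x == 10)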

# Getting a row via a label-based index ('Andrade' is a label in the index, e.g. set from last_name)
df.loc['Andrade']

# Select columns with .loc using the names of the columns
df.loc[['Andrade', 'Veness'], ['first_name', 'address', 'city']]

# To select rows whose column value equals a scalar
df.loc[df['column_name'] == some_value]

# To select rows whose column value does not equal some_value
df.loc[df['column_name'] != some_value]

# If you have multiple values you want to include, put them in a list (or more generally, any iterable) and use isin
df.loc[df['column_name'].isin(['one','three'])]

# Combine multiple conditions with &
df.loc[(df['column_name'] == some_value) & df['other_column'].isin(some_values)]

# isin returns a boolean Series, so to select rows whose value is not in some_values
df.loc[~df['column_name'].isin(some_values)]

# Getting a row via a numerical index
df.iloc[30]

# Getting a column by position
df.iloc[:, 0]

# Getting a row via a label-based or numerical index
df.ix['1999/00'] # Label based with numerical index fallback *Not recommended

# Setting a new index from an existing column
df = df.set_index(['water_year'])
df.head(5)

# Returning an index to data
df = df.reset_index('water_year')
df.head(5)

# Remove duplicate rows, keeping the first occurrence of each
df.drop_duplicates()

df = pd.DataFrame({"A":["foo", "foo", "foo", "bar"], "B":[0,1,1,1], "C":["A","A","B","A"]})
df.drop_duplicates(subset=['A', 'C'], keep='last')  # keeps rows 1, 2, 3 (the last of each (A, C) pair)

# reset index
df = df.reset_index(drop=True)

# Combine conditions with | (or)
new_df[(new_df['hour'].astype(int)==0) | (new_df['hour'].astype(int)==8) | (new_df['hour'].astype(int)==16)]

# loop iterate over rows in a DataFrame
for index, row in df.iterrows():
    print(row['dateTime'], row['CODMn'])

# Convert a column to a Python list
lst = df['row1'].tolist()

# get_loc: integer position of a label in an Index
unique_index = pd.Index(list('abc'))
unique_index.get_loc('b')  # returns 1

sorting

# Select columns, filter rows, and sort by value
df[['algo_name', 'nout', 'rmse']][(df['nout']==1) & (df['dataset']=='beijing_do')].sort_values(by='algo_name')

# Sort columns by their labels
df.sort_index(axis=1)

# Sort rows by their index labels
df.sort_index()

# Sort rows by the values of column 'a'
df.sort_values(by='a')

s = pd.Series([np.nan, 1, 3, 10, 5])
# Series in ascending order
s.sort_values(ascending=True)
# Series in descending order
s.sort_values(ascending=False)

modify data

# Set the value at index 1, column 'name' to 'aa'
df.loc[1, 'name'] = 'aa'

# Replace every value in the row at index 1
df.loc[1] = ['bb', 'ff', 11]

# At index 1, set column 'name' to 'bb' and column 'age' to 11
df.loc[1, ['name', 'age']] = ['bb', 11]

# iloc[row_index, column_index]
# Modify a single element
df.iloc[1, 2] = 19

# Replace an entire column
df.iloc[:, 2] = [11, 22, 33]

# Replace an entire row
df.iloc[0, :] = ['lily', 'F', 15]

# Conditional assignment
df.loc[df['stream'] == 2, ['feat', 'another_feat']] = 'aaaa'
print(df)
   stream        feat another_feat
a       1  some_value   some_value
b       2        aaaa         aaaa
c       2        aaaa         aaaa
d       3  some_value   some_value

apply function

# Applying a function to a column
def base_year(year):
    base_year = year[:4]
    base_year= pd.to_datetime(base_year).year
    return base_year

df['year'] = df.water_year.apply(base_year)
df.head(5)

groupby

# Manipulating structure (groupby, unstack, pivot)
# Groupby: bucket years into decades and take the max of each group
df.groupby(df.year // 10 * 10).max()

decade_rain = df.groupby([df.year // 10 * 10, df.rain_octsep // 1000 * 1000])[['outflow_octsep','outflow_decfeb','outflow_junaug']].mean()
decade_rain

# View Groups
ipl_data = {'Team': ['Riders', 'Riders', 'Devils', 'Devils', 'Kings',
                     'kings', 'Kings', 'Kings', 'Riders', 'Royals', 'Royals', 'Riders'],
            'Rank': [1, 2, 2, 3, 3, 4, 1, 1, 2, 4, 1, 2],
            'Year': [2014, 2015, 2014, 2015, 2014, 2015, 2016, 2017, 2016, 2014, 2015, 2017],
            'Points': [876, 789, 863, 673, 741, 812, 756, 788, 694, 701, 804, 690]}
df = pd.DataFrame(ipl_data)

print(df.groupby('Team').groups)

{'Kings': Int64Index([4, 6, 7],      dtype='int64'),
'Devils': Int64Index([2, 3],         dtype='int64'),
'Riders': Int64Index([0, 1, 8, 11],  dtype='int64'),
'Royals': Int64Index([9, 10],        dtype='int64'),
'kings' : Int64Index([5],            dtype='int64')}

# multiple columns
print(df.groupby(['Team','Year']).groups)

#{('Kings', 2014): Int64Index([4], dtype='int64'),
#('Royals', 2014): Int64Index([9], dtype='int64'),
#('Riders', 2014): Int64Index([0], dtype='int64'),
#('Riders', 2015): Int64Index([1], dtype='int64'),
#('Kings', 2016): Int64Index([6], dtype='int64'),
#('Riders', 2016): Int64Index([8], dtype='int64'),
#('Riders', 2017): Int64Index([11], dtype='int64'),
#('Devils', 2014): Int64Index([2], dtype='int64'),
#('Devils', 2015): Int64Index([3], dtype='int64'),
#('kings', 2015): Int64Index([5], dtype='int64'),
#('Royals', 2015): Int64Index([10], dtype='int64')}

# Iterating through Groups
grouped = df.groupby('Year')

for name, group in grouped:
    print(name)
    print(group)

2014
   Points  Rank     Team   Year
0     876     1   Riders   2014
2     863     2   Devils   2014
4     741     3   Kings    2014
9     701     4   Royals   2014

2015
   Points  Rank     Team   Year
1     789     2   Riders   2015
3     673     3   Devils   2015
5     812     4    kings   2015
10    804     1   Royals   2015

2016
   Points  Rank     Team   Year
6     756     1    Kings   2016
8     694     2   Riders   2016

2017
   Points  Rank    Team   Year
7     788     1   Kings   2017
11    690     2  Riders   2017

# Select a Group
grouped = df.groupby('Year')
print(grouped.get_group(2014))

   Points  Rank     Team    Year
0     876     1   Riders    2014
2     863     2   Devils    2014
4     741     3   Kings     2014
9     701     4   Royals    2014

# Aggregations
grouped = df.groupby('Year')
print(grouped['Points'].agg(np.mean))

Year
2014   795.25
2015   769.50
2016   725.00
2017   739.00
Name: Points, dtype: float64
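
Several aggregations can be computed at once by passing a list of function names (a small extension of the example above):

print(grouped['Points'].agg(['mean', 'sum', 'std']))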

COMBINING DATASETS

rain_jpn = pd.read_csv('notebook_playground/data/jpn_rain.csv')
rain_jpn.columns = ['year', 'jpn_rainfall']

uk_jpn_rain = df.merge(rain_jpn, on='year')
uk_jpn_rain.head(5)

# pd.merge
result = pd.merge(user_usage, user_device[['use_id', 'platform', 'device']], on='use_id')

append & concat

result = pd.concat(frames)  # frames is a list of DataFrames
result = df1.append(df4, ignore_index=True)  # DataFrame.append is deprecated; prefer pd.concat

concat

result = pd.concat([df1, df4], axis=1, sort=False)
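
A self-contained sketch of both directions (df1 and df4 here are hypothetical toy frames):

df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df4 = pd.DataFrame({'B': [5], 'C': [6]})
pd.concat([df1, df4], ignore_index=True, sort=False)  # stack rows, aligning on columns
pd.concat([df1, df4], axis=1, sort=False)             # side by side, aligning on the index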

SAVING YOUR DATASETS

# Saving your data to a csv
df.to_csv('uk_rain.csv',index=False)

data cleaning

Add default values

#Replace NaN with an empty string or some other default value.
data.country = data.country.fillna('')

#Filling missing durations with the mean helps even the dataset out. It's not a great measure, but it's an estimate of what the duration could be, based on the other rows.
data.duration = data.duration.fillna(data.duration.mean())

shift a column

a
#   x1  x2
#0   0   5
#1   1   6
#2   2   7
#3   3   8
#4   4   9

a.x2 = a.x2.shift(1)
#   x1  x2
#0   0 NaN
#1   1   5
#2   2   6
#3   3   7
#4   4   8

Remove rows & columns

#Drop columns
df.drop(['B', 'C'], axis=1)
df.drop(columns=['B', 'C'])

#Drop a row by index
df.drop([0, 1])

#Dropping all rows with any NA values is easy:
data.dropna()

#We can also drop rows where every value is NA
data.dropna(how='all')

#We can also put a limitation on how many non-null values need to be in a row in order to keep it (in this example, the data needs to have at least 5 non-null values):
data.dropna(thresh=5)

#Let's say, for instance, that we don't want to include any movie that doesn't have information on when the movie came out:
data.dropna(subset=['title_year'])

#Drop NaN rows and reassign the result (water_column is a variable holding a column name)
data = data.dropna(subset=[water_column])

#Drop columns where every value is NA:
data.dropna(axis=1, how='all')

#Drop all columns with any NA values:
data.dropna(axis=1, how='any')

datetime handling

# selecting DataFrame rows between two dates
import datetime

start = datetime.datetime(2014, 4, 2)
end = datetime.datetime(2017, 12, 15, 4)
base_df['dateTime'] = pd.to_datetime(base_df['dateTime'])
base_df = base_df.set_index(['dateTime'])
new_df = base_df.loc[start:end]
# or
df = pd.DataFrame(np.random.random((200,3)))
df['date'] = pd.date_range('2000-1-1', periods=200, freq='D')
mask = (df['date'] > '2000-6-1') & (df['date'] <= '2000-6-10')
print(df.loc[mask])
# or
df = df[(df['date'] > '2000-6-1') & (df['date'] <= '2000-6-10')]

# Convert to Unix timestamps in seconds
cod_df['datetime'] = pd.to_datetime(cod_df["datetime"]).astype('int64') / 10**9

# datetime format
df = pd.DataFrame({"Date": ["26-12-2007", "27-12-2007", "28-12-2007"]})
df["Date"] = pd.to_datetime(df["Date"]).dt.strftime('%Y-%m-%d')
print(df)

# Date ranges
# DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 01:00:00',
#                '2018-01-01 02:00:00'],
#               dtype='datetime64[ns]', freq='H')
dti = pd.date_range('2018-01-01', periods=3, freq='H')

# DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
#                '2018-04-27 00:00:00'],
#               dtype='datetime64[ns]', freq=None)
pd.date_range(start='2018-04-24', end='2018-04-27', periods=3)


# Timedelta vs. DateOffset
# This particular day contains a daylight saving time transition
ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki')

# Timedelta respects absolute time
# Timestamp('2016-10-30 23:00:00+0200', tz='Europe/Helsinki')
ts + pd.Timedelta(days=1)

# DateOffset respects calendar time
# Timestamp('2016-10-31 00:00:00+0200', tz='Europe/Helsinki')
ts + pd.DateOffset(days=1)

#            A      B          C
# 0 2018-01-01 0 days 2018-01-01
# 1 2018-01-02 1 days 2018-01-03
# 2 2018-01-03 2 days 2018-01-05
s = pd.Series(pd.date_range('2018-1-1', periods=3, freq='D'))
td = pd.Series([pd.Timedelta(days=i) for i in range(3)])
df = pd.DataFrame(dict(A=s, B=td))
df['C'] = df['A'] + df['B']


# Extract datetime components
df['hour'] = df['date'].dt.hour
df['dayofweek'] = df['date'].dt.dayofweek
df['quarter'] = df['date'].dt.quarter
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['dayofyear'] = df['date'].dt.dayofyear
df['dayofmonth'] = df['date'].dt.day
df['weekofyear'] = df['date'].dt.isocalendar().week  # .dt.weekofyear is deprecated

construct a Timestamp
>>> pd.Timestamp(year=2017, month=1, day=1, hour=12)
Timestamp('2017-01-01 12:00:00')

# Set a time component (use .loc to avoid chained assignment)
df.loc[df.index[1], 'dateTime'] = df['dateTime'].iloc[1].replace(minute=0)

# DateOffset
ts = pd.Timestamp('2014-01-01 09:00')
day = pd.offsets.Day()
# Timestamp('2014-01-02 09:00:00')
ts + day

d = datetime.datetime(2008, 8, 18, 9, 0)
# Timestamp('2008-08-11 09:00:00')
d - pd.offsets.Week()

pandas Series

# Start by creating a series with 9 one minute timestamps.
>>> index = pd.date_range('1/1/2000', periods=9, freq='T')
>>> series = pd.Series(range(9), index=index)
>>> series
2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64

# Downsample the series into 3 minute bins and sum the values of the timestamps falling into a bin.
>>> series.resample('3T').sum()
2000-01-01 00:00:00     3
2000-01-01 00:03:00    12
2000-01-01 00:06:00    21
Freq: 3T, dtype: int64

# Resample from per-minute readings to daily
# load the new file
dataset = pd.read_csv('household_power_consumption.csv', header=0, infer_datetime_format=True, parse_dates=['datetime'], index_col=['datetime'])
# resample data to daily
daily_groups = dataset.resample('D')
daily_data = daily_groups.sum()

# Convert a NumPy array to a Pandas series
np_array = np.array([10, 20, 30, 40, 50])
new_series = pd.Series(np_array)

# Convert series to numpy array
np_array = series.values

data analysis

# Time-series autocorrelation (parser here is a user-defined date-parsing function)
from pandas.plotting import autocorrelation_plot
from matplotlib import pyplot

series = pd.read_csv('shampoo-sales.csv', header=0, parse_dates=[0], index_col=0, squeeze=True, date_parser=parser)
autocorrelation_plot(series)
pyplot.show()
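
A self-contained sketch on synthetic data (the weekly sine series is made up for illustration):

import numpy as np
import pandas as pd
from pandas.plotting import autocorrelation_plot
from matplotlib import pyplot

idx = pd.date_range('2020-01-01', periods=200, freq='D')
demo = pd.Series(np.sin(np.arange(200) * 2 * np.pi / 7), index=idx)  # 7-day cycle
autocorrelation_plot(demo)  # expect peaks at lags that are multiples of 7
pyplot.show()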

setting

# display all text in a cell without truncation
pd.set_option('display.max_colwidth', None)  # use -1 in very old pandas versions

plot

df.set_index('dateTime').plot(title='title content')
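
A minimal self-contained sketch (the frame and column names are hypothetical):

import matplotlib.pyplot as plt

plot_df = pd.DataFrame({'dateTime': pd.date_range('2020-01-01', periods=10, freq='D'),
                        'value': range(10)})
plot_df.set_index('dateTime').plot(title='title content')
plt.show()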

excel

import pandas as pd

df = pd.read_excel('File.xlsx', sheet_name='Sheet1')

#Column headings
print(df.columns)

sepalWidth = df['Sepal width']

#Don't forget to add '.xlsx' at the end of the path
df.to_excel(r'C:\Users\Ron\Desktop\export_dataframe.xlsx', index=False, header=True)

HDF file

# save hdf (requires the optional PyTables package: pip install tables)
df.to_hdf('data.h5', key='df', mode='w')

# read hdf
pd.read_hdf('data.h5', 'df')