# Inspect the column dtypes: first a count of how many columns share each
# dtype, then a view restricted to the numeric (float64/int64) columns.
# Fix: dtype names are strings and must be quoted.
df.dtypes.value_counts()
df.select_dtypes(include=['float64', 'int64'])
import pandas as pd

# Demonstrate aliasing vs copying:
# plain assignment (df2 = df1) binds a second name to the SAME object,
# so mutating df2 also mutates df1.
df1 = pd.DataFrame({'a': [0, 0, 0], 'b': [1, 1, 1]})
df2 = df1                  # alias, not a copy
df2['a'] = df2['a'] + 1
df1.head()                 # df1['a'] is now [1, 1, 1] — changed through df2

# Either .copy() or copy.deepcopy() produces an independent DataFrame.
df2 = df1.copy()
from copy import deepcopy
df2 = deepcopy(df1)
# Translate the numeric codes in column 'c' into human-readable labels.
# Fix: the label values are string literals and must be quoted.
level_map = {1: 'high', 2: 'medium', 3: 'low'}
df['c_level'] = df['c'].map(level_map)
def rule(x, y):
    """Return 1 when x == 'high' and y > 10, otherwise 0.

    Fix: 'high' is a string literal and must be quoted; without quotes the
    original raised NameError.
    """
    if x == 'high' and y > 10:
        return 1
    return 0


df = pd.DataFrame({'c1': ['high', 'high', 'low', 'low'], 'c2': [0, 23, 17, 4]})
# Apply the rule row-wise; axis=1 passes each row to the lambda.
df['new'] = df.apply(lambda r: rule(r['c1'], r['c2']), axis=1)
df.head()
# Row-wise maximum of columns c1 and c2, shown two ways.
# NOTE(review): this assumes c1 and c2 hold mutually comparable values
# (e.g. both numeric) — confirm against the df actually in scope here.
df['maximum'] = df.apply(lambda r: max(r['c1'], r['c2']), axis=1)
# The vectorized form below is equivalent and much faster than apply.
df['maximum'] = df[['c1', 'c2']].max(axis=1)
# Frequency table of the values in column 'c'.
# Fix: the original call was left unterminated and the column name unquoted.
df['c'].value_counts()
normalize = True: 查看每个值出现的频率而不是次数。
dropna = False: 把缺失值也保留在这次统计中。
sort = False: 将数据按照值来排序而不是按照出现次数排序。
df['c'].value_counts().reset_index(): 将这个统计表转换成pandas的dataframe并且进行处理。
import pandas as pd
import numpy as np

# Count how many of the selected columns are missing (NaN) in each row.
df = pd.DataFrame({'id': [1, 2, 3], 'c1': [0, 0, np.nan], 'c2': [np.nan, 1, 1]})
df = df[['id', 'c1', 'c2']]          # fix column order explicitly
# isnull() gives a boolean frame; sum(axis=1) counts True values per row.
df['num_nulls'] = df[['c1', 'c2']].isnull().sum(axis=1)
df.head()
# Keep only the rows whose ID appears in the allow-list.
# Fix: the IDs are string literals and must be quoted; the trailing '...'
# placeholder was removed — extend the list with real IDs as needed.
df_filter = df['ID'].isin(['A001', 'C022'])
df[df_filter]
import numpy as np

# Bucket df['c'] into 4 groups split at its 50th/80th/95th percentiles.
cut_points = [np.percentile(df['c'], p) for p in [50, 80, 95]]
df['group'] = 1
for i in range(3):
    # Each comparison adds 1 for every row below that cut point, so rows
    # below all three cuts end up in group 4, rows above all cuts in group 1.
    df['group'] = df['group'] + (df['c'] < cut_points[i])
print(df[:5].to_csv())