1. Check missing records
def missing(df):
    """
    Summarize missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name and ordered by descending null count, with
        ``Missing_number`` (count of nulls) and ``Missing_percent`` (nulls
        divided by the number of rows, i.e. a fraction between 0 and 1 --
        it is not multiplied by 100).
    """
    # Build the null mask once; count and ratio are both derived from it.
    null_counts = df.isnull().sum().sort_values(ascending=False)
    # Every column has len(df) entries, so dividing the sorted counts by the
    # row count keeps both series in the same order.
    null_ratio = null_counts / len(df)
    return pd.concat(
        [null_counts, null_ratio],
        axis=1,
        keys=['Missing_number', 'Missing_percent'],
    )
2. Grouping columns by feature type
def categorize(df, discrete_threshold=10):
    """
    Print the columns of ``df`` grouped by feature type.

    Columns are split into numeric and non-numeric ones; numeric columns are
    further split into "discrete" (fewer than ``discrete_threshold`` distinct
    values, NaN counted as a value) and "continuous" (all other numeric
    columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    discrete_threshold : int, default 10
        A numeric column with fewer distinct values than this is reported
        as discrete.

    Returns
    -------
    None
        The grouping is only printed: first the column names per group,
        then the size of each group.
    """
    numeric_cols = df.select_dtypes([np.number]).columns.tolist()
    non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()
    discrete_cols = [
        col for col in numeric_cols
        if len(df[col].unique()) < discrete_threshold
    ]
    continuous_cols = [col for col in numeric_cols if col not in discrete_cols]

    # Adjacent f-strings are concatenated by the parser, so each message is
    # still a single print call without a line break inside a string literal.
    print(
        f"Quantitive feautres : {numeric_cols} \n"
        f"Discrete features : {discrete_cols} \n"
        f"Continous features : {continuous_cols} \n"
        f"Categorical features : {non_numeric_cols}\n"
    )
    print(
        f"Number of quantitive feautres : {len(numeric_cols)} \n"
        f"Number of discrete features : {len(discrete_cols)} \n"
        f"Number of continous features : {len(continuous_cols)} \n"
        f"Number of categorical features : {len(non_numeric_cols)}"
    )
3. Check unique values in each column
def unique(df, n_samples=5):
    """
    Return a table with the number of unique values and sample values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    n_samples : int, default 5
        Number of ``Sample<k>`` columns to produce.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with ``Columns`` (the column name),
        ``Number_of_Unique`` (``df.nunique()``, which ignores NaN) and
        ``Sample1`` .. ``Sample<n_samples>``. Each sample column holds the
        values of one randomly drawn row; rows are drawn independently, so
        the same row may appear in more than one sample column. When ``df``
        has no rows the sample columns are filled with ``None``.
    """
    summary = {
        'Columns': df.columns,
        'Number_of_Unique': df.nunique().values.tolist(),
    }
    has_rows = len(df) > 0
    for k in range(1, n_samples + 1):
        if has_rows:
            sample_row = df.sample(1).values.tolist()[0]
        else:
            # df.sample(1) raises ValueError on a frame without rows, so
            # fall back to placeholders instead of failing the overview.
            sample_row = [None] * len(df.columns)
        summary[f'Sample{k}'] = sample_row
    return pd.DataFrame(summary)
4. Preprocess multiple files
# Classification columns present in every yearly sheet; they identify a row
# and are therefore the keys the yearly tables are merged on.
_KEY_COLUMNS = ['대계열', '중계열', '소계열']

# File extensions handed to pandas.read_excel; other directory entries
# (checkpoints, hidden files, notes, ...) are skipped.
_EXCEL_SUFFIXES = ('.xls', '.xlsx', '.xlsm', '.xlsb', '.ods')


def processing_dataframe(filename, path=None):
    """
    Load one yearly Excel file and reduce it to the key columns plus a count.

    Parameters
    ----------
    filename : str
        File name inside ``path``. The text before the first ``"_"`` becomes
        the name of the count column (presumably the year -- confirm against
        the actual file naming).
    path : str, optional
        Directory containing the file. When omitted, the module-level
        ``PATH`` is used, which keeps the old single-argument call working.

    Returns
    -------
    pandas.DataFrame
        Columns ``'대계열'``, ``'중계열'``, ``'소계열'`` and the renamed
        ``'전체'`` column as ``int64`` with missing values replaced by 0.
        Rows whose ``'대계열'`` is ``'총계'`` or whose ``'중계열'`` /
        ``'소계열'`` is ``'계'`` are removed.
    """
    if path is None:
        # Legacy behaviour: fall back to a module-level PATH.
        path = PATH
    # os.path.join works whether or not `path` ends with a separator.
    doc = pd.read_excel(os.path.join(path, filename))

    # Rows flagged with these labels hold aggregate counts rather than
    # individual records, so they are dropped.
    is_aggregate = (
        (doc['대계열'] == '총계')
        | (doc['중계열'] == '계')
        | (doc['소계열'] == '계')
    )
    # .copy() gives an independent frame, so the column assignment below is
    # not a chained assignment on a view of the raw sheet.
    doc = doc.loc[~is_aggregate, _KEY_COLUMNS + ['전체']].copy()

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which may operate on a temporary copy.
    doc['전체'] = doc['전체'].fillna(0).astype('int64')

    year_col = filename.split("_")[0]
    return doc.rename(columns={'전체': year_col})


def generate_dataframe_by_path(PATH):
    """
    Merge every Excel file found in ``PATH`` into one wide table.

    Files are processed in sorted name order with ``processing_dataframe``
    and outer-merged on the three classification columns, so each file
    contributes one count column.

    Parameters
    ----------
    PATH : str
        Directory to scan.

    Returns
    -------
    pandas.DataFrame
        The outer merge of all per-file tables.

    Raises
    ------
    ValueError
        If ``PATH`` contains no Excel file.
    """
    excel_files = sorted(
        name for name in os.listdir(PATH)
        if name.lower().endswith(_EXCEL_SUFFIXES)
        # Skip hidden files and the "~$" lock files Excel creates while a
        # workbook is open.
        and not name.startswith(('.', '~$'))
    )
    if not excel_files:
        raise ValueError(f"No Excel files found in {PATH!r}")

    final_doc = None
    for name in excel_files:
        # Pass the directory explicitly so the argument given to this
        # function is the one actually read from.
        doc = processing_dataframe(name, PATH)
        if final_doc is None:
            final_doc = doc
        else:
            final_doc = pd.merge(final_doc, doc, how='outer', on=_KEY_COLUMNS)
    return final_doc
5. Personal helper function for basic EDA
def basic_EDA(df):
    """
    Print a basic exploratory overview of ``df``.

    The report consists of: a preview of the first rows, the shape and
    column names followed by the ``categorize`` output, the ``missing``
    table, the ``unique`` table, ``df.describe()`` and ``df.info()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to explore.

    Returns
    -------
    None
        Everything is printed or rendered through ``display``, which must be
        available in the calling environment (it is not defined or imported
        in this function).
    """
    separator = "-------------------------------------------------------------------------------\n"

    # Dataset preview
    print("1. Dataset Preview \n")
    display(df.head())
    print(separator)

    # Column information
    print("2. Column Information \n")
    # shape is (rows, columns): shape[0] is the row count.
    print(f"Dataset have {df.shape[0]} rows and {df.shape[1]} columns")
    print("\n")
    print(f"Dataset Column name : {df.columns.values}")
    print("\n")
    categorize(df)
    print(separator)

    # Basic information tables
    print("3. Missing data table : \n")
    display(missing(df))
    print(separator)

    print("4. Number of unique value by column : \n")
    display(unique(df))
    print(separator)

    print("5. Describe table : \n")
    display(df.describe())
    print(separator)

    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line.
    df.info()
    print(separator)
'Data Science > Pandas' 카테고리의 다른 글
[pandas] Optimizing DataFrame's Memory (0) | 2022.10.11 |
---|---|
[pandas] Introduction to Pandas (0) | 2022.10.04 |
[pandas] Basic Data Exploration (0) | 2022.09.19 |
[pandas] Cut rows based on integer (0) | 2022.09.18 |
[pandas] Set options (0) | 2022.09.18 |