본문 바로가기

Data Science/Pandas

[pandas] Useful personal functions for EDA

1. Check missing records

def missing(df) :
    """
    Summarise missing data per column.

    Returns a DataFrame indexed by column name with two columns:
    'Missing_number' (count of null cells) and 'Missing_percent'
    (null cells divided by the number of rows, i.e. a fraction between
    0 and 1). Both series are sorted in descending order before being
    combined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    null_ratio = null_counts / null_mask.count()
    summary_parts = [
        null_counts.sort_values(ascending = False),
        null_ratio.sort_values(ascending = False),
    ]
    # concat aligns the two series on the column-name index.
    return pd.concat(summary_parts, axis = 1, keys = ['Missing_number', 'Missing_percent'])

 

2. Group columns by feature type

def categorize(df, discrete_threshold = 10) :
    """
    Print the columns of df grouped by feature type, then the size of each group.

    Numeric columns are reported as quantitative and every other column as
    categorical. A quantitative column is reported as discrete when it holds
    fewer than `discrete_threshold` distinct values (NaN counts as a value),
    and as continuous otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    discrete_threshold : int, default 10
        Exclusive upper bound on the number of distinct values for a numeric
        column to be reported as discrete.

    Returns
    -------
    None
        The grouping is only printed.
    """
    quantitative = df.select_dtypes([np.number]).columns.tolist()
    categorical = df.select_dtypes(exclude = [np.number]).columns.tolist()

    # Single pass: each numeric column lands in exactly one of the two lists.
    discrete, continuous = [], []
    for col in quantitative :
        if len(df[col].unique()) < discrete_threshold :
            discrete.append(col)
        else :
            continuous.append(col)

    # Each output line is its own f-string: a single-quoted f-string cannot
    # span several physical lines.
    group_lines = [
        f"Quantitive feautres : {quantitative}",
        f"Discrete features : {discrete}",
        f"Continous features : {continuous}",
        f"Categorical features : {categorical}",
    ]
    count_lines = [
        f"Number of quantitive feautres : {len(quantitative)}",
        f"Number of discrete features : {len(discrete)}",
        f"Number of continous features : {len(continuous)}",
        f"Number of categorical features : {len(categorical)}",
    ]
    # The trailing "\n" leaves a blank line between the two listings.
    print("\n".join(group_lines) + "\n")
    print("\n".join(count_lines))

 

3. Check unique values in each column

def unique(df) : 
    """
    Build a per-column overview table.

    The result has one row per column of df: the column name ('Columns'),
    its number of distinct values as given by df.nunique()
    ('Number_of_Unique'), and five example values ('Sample1'..'Sample5').
    Each SampleN column is taken from one independently drawn random row
    of df, so the same row may appear more than once.
    """
    overview = {
        'Columns' : df.columns,
        'Number_of_Unique' : df.nunique().values.tolist(),
    }
    # One df.sample(1) draw per sample column, in Sample1..Sample5 order.
    for sample_no in range(1, 6) :
        overview[f'Sample{sample_no}'] = df.sample(1).values.tolist()[0]
    return pd.DataFrame(overview)

 

4. Preprocess multiple files

def processing_dataframe(filename, path = None) : 
    """
    Load one Excel file and reduce it to the three category columns plus one value column.

    Parameters
    ----------
    filename : str
        File name inside the data directory. The text before the first "_"
        becomes the name of the value column.
    path : str, optional
        Directory containing `filename`. When omitted, the module-level
        `PATH` is used, so existing single-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns '대계열', '중계열', '소계열' and the renamed '전체' column, with
        missing values replaced by 0 and cast to int64.
    """
    base_dir = PATH if path is None else path
    # os.path.join works whether or not the directory ends with a separator.
    doc = pd.read_excel(os.path.join(base_dir, filename))

    # Drop aggregate rows ('총계' / '계' mark total rows) so only real categories remain.
    is_total_row = (doc['대계열'] == '총계') | (doc['중계열'] == '계') | (doc['소계열'] == '계')

    # .copy() yields an independent frame, so the assignment below is a plain
    # column assignment rather than a chained operation on a slice.
    doc = doc.loc[~is_total_row, ['대계열', '중계열', '소계열', '전체']].copy()

    # Assign the result back: an in-place fillna on doc['전체'] would act on a
    # temporary object and leave NaN in place for the int64 cast to choke on.
    doc['전체'] = doc['전체'].fillna(0).astype('int64')

    year_col = filename.split("_")[0]
    return doc.rename(columns = {'전체' : year_col})


def generate_dataframe_by_path(PATH) :
    """
    Process every file in a directory and outer-merge the results into one table.

    Files are handled in sorted name order; each is loaded with
    processing_dataframe from the same directory that was listed.

    Parameters
    ----------
    PATH : str
        Directory whose entries are all passed to processing_dataframe.

    Returns
    -------
    pandas.DataFrame
        Outer merge (on the shared columns) of all processed files.

    Raises
    ------
    ValueError
        If the directory contains no entries.
    """
    file_list = sorted(os.listdir(PATH))
    if not file_list :
        raise ValueError(f"No files found in directory: {PATH}")

    final_doc = None
    for file in file_list : 
        # Pass the directory explicitly so listing and reading always agree.
        doc = processing_dataframe(file, PATH)
        final_doc = doc if final_doc is None else pd.merge(final_doc, doc, how = 'outer')

    return final_doc

 

5. Basic EDA personal function

def basic_EDA(df) :   
    """
    Print a sectioned exploratory overview of df.

    Sections, in order: head preview; shape, column names and feature
    grouping (via categorize); missing-value table (via missing);
    unique-value table (via unique); describe() summary; df.info().

    Tables are rendered with `display`, which is assumed to be provided by
    the calling environment (e.g. a Jupyter notebook) - it is not defined here.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to explore.

    Returns
    -------
    None
    """
    separator = "-------------------------------------------------------------------------------\n"

    # Dataset preview 
    print("1. Dataset Preview \n")
    display(df.head())
    print(separator)

    # Column information
    print("2. Column Information \n")
    # shape is (rows, columns)
    print(f"Dataset have {df.shape[0]} rows and {df.shape[1]} columns")
    print("\n") 
    print(f"Dataset Column name : {df.columns.values}")
    print("\n")
    categorize(df)
    print(separator)

    # Basic information tables
    print("3. Missing data table : \n")
    display(missing(df))
    print(separator)

    print("4. Number of unique value by column : \n")
    display(unique(df))
    print(separator)

    print("5. Describe table : \n")
    display(df.describe())
    print(separator)

    # info() prints its report itself and returns None; wrapping it in print()
    # would add a stray "None" line.
    df.info()
    print(separator)

'Data Science > Pandas' 카테고리의 다른 글

[pandas] Optimizing DataFrame's Memory  (0) 2022.10.11
[pandas] Introduction to Pandas  (0) 2022.10.04
[pandas] Basic Data Exploration  (0) 2022.09.19
[pandas] Cut rows based on integer  (0) 2022.09.18
[pandas] Set options  (0) 2022.09.18