Source code for buildml.eda._eda

import matplotlib.pyplot as plt
import seaborn as sns
import ydata_profiling as pp
import sweetviz as sv

# Module-level metadata: author, contact e-mail, copyright notice and license.
__author__ = "TechLeo"
__email__ = "techleo.ng@outlook.com"
__copyright__ = "Copyright (c) 2023 TechLeo"
__license__ = "MIT"

def eda(data):
    """Print a summary of *data* and return common exploratory statistics.

    ``data.info()`` is called first (it prints its summary), followed by
    two blank lines; every other result is returned rather than printed.

    Parameters
    ----------
    data
        A pandas ``DataFrame`` (or an object exposing the same methods:
        ``info``, ``head``, ``tail``, ``describe``, ``mode``, ``nunique``,
        ``isnull``, ``select_dtypes``).

    Returns
    -------
    dict
        Mapping with the following keys:

        - ``"Dataset"``: the object passed in, unchanged.
        - ``"Data_Head"`` / ``"Data_Tail"``: ``data.head()`` / ``data.tail()``.
        - ``"Data_Descriptive_Statistic"``: ``data.describe()``.
        - ``"Data_More_Descriptive_Statistic"``:
          ``data.describe(include="all")``.
        - ``"Data_Mode"``: ``data.mode()``.
        - ``"Data_Distinct_Count"``: ``data.nunique()``.
        - ``"Data_Null_Count"``: per-column count of null values.
        - ``"Data_Total_Null_Count"``: total count of null values.
        - ``"Data_Correlation_Matrix"``: pairwise correlation of the numeric
          and boolean columns only.
    """
    data.info()
    print("\n\n")

    # Computed once; the grand total is just the sum of the per-column counts.
    null_counts = data.isnull().sum()

    # Correlation is only defined for numeric data. Calling ``data.corr()``
    # on the whole frame raises under pandas >= 2.0 (``numeric_only`` now
    # defaults to False) whenever a string/object column is present, so the
    # frame is narrowed explicitly. ``select_dtypes`` is used instead of
    # ``numeric_only=True`` because it also works on older pandas releases.
    numeric_data = data.select_dtypes(include=["number", "bool"])

    return {
        "Dataset": data,
        "Data_Head": data.head(),
        "Data_Tail": data.tail(),
        "Data_Descriptive_Statistic": data.describe(),
        "Data_More_Descriptive_Statistic": data.describe(include="all"),
        "Data_Mode": data.mode(),
        "Data_Distinct_Count": data.nunique(),
        "Data_Null_Count": null_counts,
        "Data_Total_Null_Count": null_counts.sum(),
        "Data_Correlation_Matrix": numeric_data.corr(),
    }
def eda_visual(
    data,
    y: str,
    histogram_bins: int = 10,
    figsize_heatmap: tuple = (15, 10),
    figsize_histogram: tuple = (15, 10),
    figsize_barchart: tuple = (15, 10),
    before_data_cleaning: bool = True,
):
    """Plot histograms and a correlation heatmap for *data*.

    When ``before_data_cleaning`` is true, one count plot per object-dtype
    column is drawn first (each shown immediately). In every case the
    function then draws histograms via ``data.hist`` and an annotated
    heatmap of the correlation matrix, and shows them.

    Parameters
    ----------
    data
        A pandas ``DataFrame`` (or an object exposing ``select_dtypes``,
        ``hist`` and ``corr`` in the same way).
    y
        Currently unused by this function; retained so existing callers
        keep working.
    histogram_bins
        Number of bins passed to ``data.hist``.
    figsize_heatmap
        Figure size for the correlation heatmap.
    figsize_histogram
        Figure size passed to ``data.hist``.
    figsize_barchart
        Figure size for each categorical count plot.
    before_data_cleaning
        If true, also plot the distribution of every object-dtype column.

    Returns
    -------
    None
    """
    if before_data_cleaning:
        # Visualize the distribution of categorical features
        categorical_features = data.select_dtypes(include="object").columns
        for feature in categorical_features:
            plt.figure(figsize=figsize_barchart)
            sns.countplot(x=feature, data=data)
            plt.title(f'Distribution of {feature}')
            plt.show()

    data.hist(figsize=figsize_histogram, bins=histogram_bins)

    # ``data.corr()`` on the whole frame raises under pandas >= 2.0 when any
    # string/object column is present -- exactly the columns plotted above --
    # so the correlation is computed on numeric/boolean columns only.
    numeric_data = data.select_dtypes(include=["number", "bool"])

    plt.figure(figsize=figsize_heatmap)
    sns.heatmap(numeric_data.corr(), cmap="coolwarm", annot=True, fmt=".2f")
    plt.title('Correlation Matrix')
    plt.show()
def sweet_viz(dataset, filename: str, auto_open: bool = False):
    """Analyze *dataset* with Sweetviz and write the report as HTML.

    Parameters
    ----------
    dataset
        Data handed to ``sv.analyze``.
    filename
        Passed to ``show_html`` as ``filepath``.
    auto_open
        Passed to ``show_html`` as ``open_browser``.

    Returns
    -------
    None
    """
    sv.analyze(dataset).show_html(filepath=filename, open_browser=auto_open)
def pandas_profiling(
    dataset,
    output_file: str = "Pandas Profile Report.html",
    dark_mode: bool = False,
    title: str = "Report",
):
    """Build an explorative profile report for *dataset* and save it.

    The report is created with ``explorative=True``, rendered through
    ``to_widgets()`` and then written out with ``to_file``.

    Parameters
    ----------
    dataset
        Data handed to ``ProfileReport`` as ``df``.
    output_file
        Destination passed to ``to_file``.
    dark_mode
        Forwarded to ``ProfileReport`` as ``dark_mode``.
    title
        Forwarded to ``ProfileReport`` as ``title``.

    Returns
    -------
    None
    """
    report_options = {
        "df": dataset,
        "dark_mode": dark_mode,
        "explorative": True,
        "title": title,
    }
    profile = pp.ProfileReport(**report_options)

    # NOTE(review): to_widgets() presumably needs a notebook/ipywidgets
    # environment -- confirm behavior when run as a plain script.
    profile.to_widgets()
    profile.to_file(output_file=output_file)