2

我有一个数据框

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
# Sample data: 16 rows with string label columns, an integer UID column
# and four float "Time" columns. Dict insertion order fixes the column order.
df = pd.DataFrame(
    data={
        'Gen': ['M', 'M', 'M', 'M', 'F', 'F', 'F', 'F',
                'M', 'M', 'M', 'M', 'F', 'F', 'F', 'F'],
        'Site': ['FRX', 'FX', 'FRX', 'FRX', 'FRX', 'FX', 'FRX', 'FX',
                 'FX', 'FX', 'FX', 'FRX', 'FRX', 'FRX', 'FRX', 'FRX'],
        'Type': ['L', 'L', 'L', 'L', 'L', 'L', 'L', 'L',
                 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'R'],
        'UID': [1001, 1002, 1003, 1004, 1001, 1002, 1003, 1004,
                1001, 1002, 1003, 1004, 1001, 1002, 1003, 1004],
        'color': ['R', 'R', 'G', 'G', 'B', 'G', 'B', 'B',
                  'R', 'G', 'R', 'G', 'B', 'B', 'R', 'G'],
        'Time2': [150.78, 162.34, 188.53, 197.69, 208.07, 217.76, 229.48, 139.51,
                  146.87, 182.54, 189.57, 199.97, 229.28, 244.73, 269.91, 249.19],
        'Time3': [250.78, 262.34, 288.53, 297.69, 308.07, 317.7, 329.81, 339.15,
                  346.87, 382.54, 369.59, 399.97, 329.28, 347.73, 369.91, 349.12],
        'Time4': [240.18, 232.14, 258.53, 276.69, 338.07, 307.74, 359.16, 339.25,
                  365.87, 392.48, 399.97, 410.75, 429.08, 448.39, 465.15, 469.33],
        'Time5': [270.84, 282.14, 298.53, 306.69, 318.73, 327.47, 369.63, 389.59,
                  398.75, 432.18, 449.78, 473.55, 494.85, 509.39, 515.52, 539.23],
    },
)
# Print a summary of the columns, non-null counts and dtypes.
df.info()

在此处输入图像描述

我想编写一个函数,它接受一个 DataFrame 并执行以下操作:

  1. 为 object dtype 的列绘制 countplot(Gen、Site、Type 和 color 列,共 4 个计数图)

  2. 为 float dtype 的列绘制 boxplot(Time2、……、Time5 列,共 4 个箱线图)

  3. 将图表导出为 pdf 文件 - 每页两个图表

我的尝试:

# I am open to other approaches
def data_explorer(data, filename='data_exploration.pdf', plots_per_page=2):
    """Plot every object/float column of ``data`` into one multi-page PDF.

    Columns whose dtype compares equal to ``'object'`` get a seaborn count
    plot; columns whose dtype compares equal to ``'float'`` get a seaborn
    box plot. Any other column is skipped with a printed notice. The plots
    are laid out top-to-bottom, ``plots_per_page`` per page, and all pages
    are written to a single PDF.

    Args:
        data: The pandas DataFrame to explore.
        filename: Path of the PDF file to write.
        plots_per_page: Number of plots stacked on each PDF page.

    Raises:
        ValueError: If ``plots_per_page`` is smaller than 1.

    Note:
        Nothing is written when no column qualifies for a plot.
    """
    if plots_per_page < 1:
        raise ValueError("plots_per_page must be at least 1")

    # Decide up front which plot each column gets, so pages can be
    # filled without gaps left by skipped columns.
    plot_jobs = []
    for col in data.columns:
        # 1. countplots for columns with the object dtype
        if data[col].dtype == 'object':
            plot_jobs.append((col, sns.countplot))
        # 2. boxplots for columns with the float dtype
        elif data[col].dtype == 'float':
            plot_jobs.append((col, sns.boxplot))
        else:
            print("skip integer dtype")

    if not plot_jobs:
        return

    # 3. save the graphs as one PDF, one figure per page. Saving inside the
    # column loop with plt.savefig would overwrite the file on every pass
    # and keep drawing on the same axes.
    with PdfPages(filename) as pdf:
        for start in range(0, len(plot_jobs), plots_per_page):
            page_jobs = plot_jobs[start:start + plots_per_page]
            # squeeze=False keeps `axes` a 2-D array even for a single row.
            fig, axes = plt.subplots(
                len(page_jobs), 1, squeeze=False, figsize=(8.27, 11.69)
            )
            for (col, plot_func), ax in zip(page_jobs, axes.ravel()):
                # Keyword arguments avoid relying on seaborn's positional
                # argument order.
                plot_func(data=data, x=col, ax=ax)
                ax.set_title(str(col))
            fig.tight_layout()
            pdf.savefig(fig)
            # Release the figure so figures do not accumulate across pages.
            plt.close(fig)


请注意:最终输出应该有总共 8 个图表

4

1 回答 1

2
  • 主要问题在于:这些绘图应当成组地放进同一个图形(figure)中再保存,而不是在遍历每一列时单独保存。
  • 请根据需要调整 `figsize=(15, 30)`。

选项 1:4 个图形(figure),每页 2 个绘图

  1. 使用 `.select_dtypes` 按 dtype 选出数据框中的列
  2. 使用列表推导式,按每页的绘图数把列分成若干块。根据需要调整块大小 `n`
  3. 遍历每组列
  4. 创建一个行数等于每页绘图数的图形
  5. 将绘图添加到图形并保存图形
def data_explorer(df, plots_per_page=2):
    """Save count plots and box plots of ``df`` as PDFs, a few plots per file.

    Object-dtype columns are drawn as seaborn count plots and float-dtype
    columns as seaborn box plots. Within each family the columns are split
    into chunks of ``plots_per_page``; every chunk becomes one figure with
    ``plots_per_page`` stacked rows, saved to
    ``data_exploration_<col1>_<col2>....pdf`` in the working directory.

    Args:
        df: The pandas DataFrame to explore.
        plots_per_page: Number of plots (rows) per saved figure.

    Raises:
        ValueError: If ``plots_per_page`` is smaller than 1.
    """
    if plots_per_page < 1:
        raise ValueError("plots_per_page must be at least 1")

    # Each entry pairs the columns of one dtype family with the seaborn
    # function used to draw them; object columns are processed first.
    plot_groups = (
        (df.select_dtypes(include=['object']), sns.countplot),
        (df.select_dtypes(include=['float']), sns.boxplot),
    )

    for frame, plot_func in plot_groups:
        columns = list(frame.columns)
        for start in range(0, len(columns), plots_per_page):
            cols = columns[start:start + plots_per_page]
            # squeeze=False keeps `axes` an array even when there is a single
            # row; otherwise a bare Axes is returned and cannot be zipped.
            fig, axes = plt.subplots(
                plots_per_page, 1, squeeze=False, figsize=(15, 30)
            )
            panels = axes.ravel()
            for col, ax in zip(cols, panels):
                plot_func(data=frame[[col]], x=col, ax=ax)
            # A short last chunk leaves unused panels; hide them instead of
            # saving empty axes frames.
            for ax in panels[len(cols):]:
                ax.set_visible(False)
            # str() lets non-string column labels take part in the file name.
            fig.savefig(f'data_exploration_{"_".join(map(str, cols))}.pdf')
            # Release the figure so figures do not accumulate across chunks.
            plt.close(fig)


# Run the explorer on the sample DataFrame `df`.
data_explorer(df)

选项 2:2 个图形(figure),每页 4 个绘图

  1. 使用 `.select_dtypes` 按 dtype 选出数据框中的列
  2. 创建一个图形以匹配每页的绘图数,等于每组的总列数。
  3. 将每组列添加到绘图图中,然后保存该图。
def data_explorer(df):
    """Save one PDF of count plots and one PDF of box plots for ``df``.

    All object-dtype columns are drawn as seaborn count plots on one figure,
    and all float-dtype columns as seaborn box plots on a second figure.
    Each figure is a two-column grid with at least two rows, saved to
    ``data_exploration_<col1>_<col2>....pdf`` in the working directory.
    A dtype family with no columns produces no file.

    Args:
        df: The pandas DataFrame to explore.
    """
    grid_width = 2
    min_rows = 2

    # Each entry pairs the columns of one dtype family with the seaborn
    # function used to draw them; object columns are processed first.
    plot_groups = (
        (df.select_dtypes(include=['object']), sns.countplot),
        (df.select_dtypes(include=['float']), sns.boxplot),
    )

    for frame, plot_func in plot_groups:
        columns = list(frame.columns)
        if not columns:
            # Nothing to draw; avoid writing a blank 'data_exploration_.pdf'.
            continue

        # Ceiling division: grow the grid past 2x2 when there are more than
        # four columns, so zip() below never silently drops a column.
        n_rows = max(min_rows, -(-len(columns) // grid_width))
        fig, axes = plt.subplots(
            n_rows, grid_width, squeeze=False, figsize=(20, 30)
        )
        panels = axes.ravel()
        for col, ax in zip(columns, panels):
            plot_func(data=frame[[col]], x=col, ax=ax)
        # Hide grid cells that have no column to show.
        for ax in panels[len(columns):]:
            ax.set_visible(False)
        # str() lets non-string column labels take part in the file name.
        fig.savefig(f'data_exploration_{"_".join(map(str, columns))}.pdf')
        # Release the figure once it is on disk.
        plt.close(fig)


# Run the explorer on the sample DataFrame `df`.
data_explorer(df)

在此处输入图像描述

在此处输入图像描述

于 2021-11-04T18:30:19.440 回答