最近发现一个非常不错的Python可视化库,VizPool。
低代码,只需几行代码即可实现多种数据可视化图表的制作。
而且还能用于机器学习模型的可视化评估,例如特征重要性、混淆矩阵、ROC曲线和残差图。
相关文档地址如下
https://github.com/Hassi34/VizPool
https://jovian.ai/hasnainmehmood3435/vizpool-static-api
可以使用pip进行安装。
pip install vizpool
安装好以后,导入相关库。
import pandas as pd
from vizpool.static import EDA

# Location of the tips dataset inside the local seaborn-data checkout.
csv_path = "seaborn-data/tips.csv"
df = pd.read_csv(csv_path)

# Wrap the frame in an EDA (Exploratory Data Analysis) instance; the chart
# examples call their plotting methods on an object built this way.
tips_eda = EDA(df)
加载数据,进行数据探索分析工作。
其中tips.csv可以在【GitHub】上获取。
https://github.com/mwaskom/seaborn-data/blob/master/tips.csv
其中数据含义如下。
total_bill:消费总金额
tip:小费金额
sex:性别
smoker:是否吸烟
day:消费日(星期几)
time:消费时段(午餐或晚餐)
size:聚餐人数
下面就来看看图表示例吧。
01. 饼图条形图
import pandas as pd
from vizpool.static import EDA

# Load the dataset.
df = pd.read_csv("seaborn-data/tips.csv")

# Instantiate the EDA (Exploratory Data Analysis) helper.
tips_eda = EDA(df)

# Pie chart combined with a bar chart, grouped by the 'sex' column.
# (The trailing semicolon from the notebook version is dropped: it only
# suppresses cell output in Jupyter and is noise in a script.)
plt = tips_eda.pie_bar(hue='sex')
plt.savefig("Pie_bar.png")
02. 箱线图
import pandas as pd
from vizpool.static import EDA

# Build the EDA helper straight from the tips CSV.
explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Grid of box plots covering every numeric column.
chart = explorer.boxplot(height=3, width=10)
chart.savefig("Box_all.png")
import pandas as pd
from vizpool.static import EDA

tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Grid of box plots for every numeric column, split by the 'sex' category.
box_options = {"hue": 'sex', "height": 6}
chart = explorer.boxplot(**box_options)
chart.savefig("Box_sex.png")
import pandas as pd
from vizpool.static import EDA

# Read the data and hand it to the EDA helper.
tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Box plot restricted to one numeric column.
selected_columns = ['total_bill']
chart = explorer.boxplot(col_to_plot=selected_columns, width=4, height=3)
chart.savefig("Box_total_bill.png")
import pandas as pd
from vizpool.static import EDA

explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Box plot of a chosen numeric column, split by the 'sex' category.
chart = explorer.boxplot(
    col_to_plot=['total_bill'],
    hue='sex',
    width=4,
    height=3,
)
chart.savefig("Box_sex_total_bill.png")
03. 计数图
import pandas as pd
from vizpool.static import EDA

# Load the tips data and create the EDA helper in one step.
explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Grid of count plots for all non-numeric columns in the data.
chart = explorer.countplot()
chart.savefig("Count.png")
import pandas as pd
from vizpool.static import EDA

tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Count plot for a single categorical column (passed positionally as a list).
categorical_columns = ['sex']
chart = explorer.countplot(categorical_columns, height=4, width=4)
chart.savefig("Count_sex.png")
04. 直方图
import pandas as pd
from vizpool.static import EDA

# Read the tips CSV and wrap it for exploratory plotting.
tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Grid of histograms for every numeric column.
chart = explorer.histogram(height=7)
chart.savefig("Histogram.png")
import pandas as pd
from vizpool.static import EDA

explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Histograms with a categorical column supplied through the `hue` keyword.
hist_options = {"hue": 'sex', "height": 7}
chart = explorer.histogram(**hist_options)
chart.savefig("Histogram_sex.png")
import pandas as pd
from vizpool.static import EDA

tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Histogram of one specific numeric column.
selected_columns = ['total_bill']
chart = explorer.histogram(col_to_plot=selected_columns, height=4, width=6)
chart.savefig("Histogram_total_bill.png")
import pandas as pd
from vizpool.static import EDA

# Load the data and create the EDA helper.
explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Histogram of a specific numeric column, split by the 'sex' category.
chart = explorer.histogram(
    col_to_plot=['total_bill'],
    hue='sex',
    height=4,
    width=6,
)
chart.savefig("Histogram_sex_total_bill.png")
05. 柱状图
import pandas as pd
from vizpool.static import EDA

tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Grid of bar plots: every categorical column against the given numeric
# column ('tip'), with 'sum' passed as the estimator.
chart = explorer.barplot(y='tip', estimator='sum')
chart.savefig("Bar.png")
import pandas as pd
from vizpool.static import EDA

# Create the EDA helper directly from the CSV contents.
explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Grid of bar plots for all categorical columns against 'tip', with the hue
# set to the categorical column 'time'.
bar_options = {"y": 'tip', "hue": 'time'}
chart = explorer.barplot(**bar_options)
chart.savefig("Bar_time.png")
import pandas as pd
from vizpool.static import EDA

tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Bar plot for a single chosen variable ('smoker') against 'tip'.
chart = explorer.barplot(
    y='tip',
    col_to_plot='smoker',
    hue='time',
    height=3,
    width=4,
)
chart.savefig("Bar_single.png")
06. 小提琴图
import pandas as pd
from vizpool.static import EDA

# Load the tips data and wrap it for exploratory plotting.
explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Violin plots of all numeric columns against the target/categorical column
# passed through the `hue` keyword.
chart = explorer.violinplot(hue='sex', height=7)
chart.savefig("Violin.png")
import pandas as pd
from vizpool.static import EDA

tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Violin plots of selected numeric columns against the target/categorical
# column passed through the `hue` keyword.
numeric_columns = ['total_bill', 'tip']
chart = explorer.violinplot(col_to_plot=numeric_columns, hue='sex', height=5)
chart.savefig("Violin_col.png")
07. 热力图
import pandas as pd
from vizpool.static import EDA

# Read the dataset and build the EDA helper.
tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Correlation heatmap.
heatmap_size = {"height": 5, "width": 5}
chart = explorer.corr_heatmap(**heatmap_size)
chart.savefig("Correlation_Heatmap.png")
08. 联合图
import pandas as pd
from vizpool.static import EDA

explorer = EDA(pd.read_csv("seaborn-data/tips.csv"))

# Joint plot of two numeric variables.
joint_options = {
    "x": 'total_bill',
    "y": 'tip',
    "height": 5,
    "width": 5,
    "color": 'green',
}
chart = explorer.jointplot(**joint_options)
chart.savefig("Joint.png")
09. 成对图
import pandas as pd
from vizpool.static import EDA

# Load the tips data and create the EDA helper.
tips = pd.read_csv("seaborn-data/tips.csv")
explorer = EDA(tips)

# Pair plot of all numeric features, with the categorical column passed as
# the `hue` keyword argument.
chart = explorer.pairplot(hue='sex', height=5, width=8)
chart.savefig("Pair.png")
10. 模型训练和评估
选取部分列作为特征,尝试预测顾客是否吸烟。
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from vizpool.static import Evaluation

# Load the dataset.
df = pd.read_csv("seaborn-data/tips.csv")

# Feature importance obtained through a scikit-learn pipeline.
# Features and target are selected directly from df instead of pop()-ing the
# target out of a column slice, which mutates a derived frame and can trigger
# pandas' SettingWithCopyWarning.
pipeline_data = df[["total_bill", "tip", "size", "sex"]]
target = LabelEncoder().fit_transform(df["smoker"])
X_train, X_val, y_train, y_val = train_test_split(
    pipeline_data, target, test_size=0.2, random_state=42
)

# Instantiate the evaluation helper with the validation targets.
model_eval = Evaluation(y_val)

# Scale the numeric columns and one-hot encode 'sex'; any other column is
# dropped.
col_trans = ColumnTransformer(transformers=[
    ('num_processing', StandardScaler(), ["total_bill", "tip", "size"]),
    ('cat_processing', OneHotEncoder(), ['sex']),
], remainder='drop')

pipe_rfc = Pipeline(steps=[
    ('tranformer', col_trans),
    # Fixed seed so the saved importance plot is reproducible, in line with
    # the seeded train/validation split above.
    ('classifier', RandomForestClassifier(random_state=42)),
])
pipe_rfc.fit(X_train, y_train)

plt = model_eval.feature_importance(pipe_rfc, X_val=X_val, pipeline=True,
                                    height=6, width=16)
plt.savefig("RandomForestClassifier.png")
import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import ExtraTreeClassifier
from vizpool.static import Evaluation

# Load the dataset.
df = pd.read_csv("seaborn-data/tips.csv")

# Feature importance taken directly from an estimator (no pipeline).
# The earlier version first split a "pipeline" frame and then immediately
# re-split the numeric-only frame with the same seed; only the second split
# was ever used for training, so the data is prepared once here. Selecting
# columns instead of pop()-ing from a slice also avoids pandas'
# SettingWithCopyWarning.
features = df[["total_bill", "tip", "size"]]
target = LabelEncoder().fit_transform(df["smoker"])
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Instantiate the evaluation helper with the validation targets.
model_eval = Evaluation(y_val)

# Only the extra-tree classifier is plotted, so it is the only model trained.
# Fixed seed so the saved plot is reproducible.
etc = ExtraTreeClassifier(random_state=42)
etc.fit(X_train, y_train)

plt = model_eval.feature_importance(etc, X_val=X_val,
                                    height=6, width=16)
plt.savefig("ExtraTreeClassifier.png")
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from vizpool.static import Evaluation

# Load the dataset.
df = pd.read_csv("seaborn-data/tips.csv")

# Prepare numeric features and the encoded 'smoker' target once. The earlier
# version split a throw-away "pipeline" frame first and then re-split with
# the same seed; selecting columns (rather than pop()-ing from a slice) also
# avoids pandas' SettingWithCopyWarning.
features = df[["total_bill", "tip", "size"]]
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(df["smoker"])
# Take display labels from the encoder so label i names encoded class i;
# unique() returns order of appearance, which need not match the encoder's
# sorted class order.
target_class_labels = label_encoder.classes_.tolist()
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Instantiate the evaluation helper with the validation targets.
model_eval = Evaluation(y_val)

logistic_reg_clf = LogisticRegression()
logistic_reg_clf.fit(X_train, y_train)
logistic_predictions = logistic_reg_clf.predict(X_val)

# Confusion matrix shown as rates.
plt = model_eval.confusion_matrix(logistic_predictions,
                                  target_names=target_class_labels,
                                  height=3, width=4)
plt.savefig("predictions_rate.png")
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from vizpool.static import Evaluation

# Load the dataset.
df = pd.read_csv("seaborn-data/tips.csv")

# Prepare numeric features and the encoded 'smoker' target once. The earlier
# version split a throw-away "pipeline" frame first and then re-split with
# the same seed; selecting columns (rather than pop()-ing from a slice) also
# avoids pandas' SettingWithCopyWarning.
features = df[["total_bill", "tip", "size"]]
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(df["smoker"])
# Take display labels from the encoder so label i names encoded class i;
# unique() returns order of appearance, which need not match the encoder's
# sorted class order.
target_class_labels = label_encoder.classes_.tolist()
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Instantiate the evaluation helper with the validation targets.
model_eval = Evaluation(y_val)

logistic_reg_clf = LogisticRegression()
logistic_reg_clf.fit(X_train, y_train)
logistic_predictions = logistic_reg_clf.predict(X_val)

# Confusion matrix shown as raw counts (normalisation switched off).
plt = model_eval.confusion_matrix(logistic_predictions, target_names=target_class_labels,
                                  height=3, width=4, normalize=False)
plt.savefig("predictions_count.png")
import pandas as pd
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import ExtraTreeClassifier
from vizpool.static import Evaluation

# Load the dataset.
df = pd.read_csv("seaborn-data/tips.csv")

# Prepare numeric features and the encoded 'smoker' target once. The earlier
# version split a throw-away "pipeline" frame first and then re-split with
# the same seed; selecting columns (rather than pop()-ing from a slice) also
# avoids pandas' SettingWithCopyWarning.
features = df[["total_bill", "tip", "size"]]
target = LabelEncoder().fit_transform(df["smoker"])
X_train, X_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Instantiate the evaluation helper with the validation targets.
model_eval = Evaluation(y_val)

# Train the four classifiers to compare. The plot receives the fitted
# estimators themselves, so no prediction arrays are computed here.
# Estimators with internal randomness get a fixed seed so the saved figure
# is reproducible.
logistic_reg_clf = LogisticRegression()
logistic_reg_clf.fit(X_train, y_train)

etc = ExtraTreeClassifier(random_state=42)
etc.fit(X_train, y_train)

svc = SVC(probability=True, random_state=42)
svc.fit(X_train, y_train)

sgd = SGDClassifier(random_state=42)
sgd.fit(X_train, y_train)

# ROC curves.
plt = model_eval.auc_roc_plot(X_val, [sgd, svc, logistic_reg_clf, etc],
                              ['SGDClassifier', 'SVC', 'Logistic Regression', 'ExtraTreeClf'],
                              height=5, width=8)
plt.savefig("ROC.png")
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from vizpool.static import Evaluation

# Read the tips dataset.
tips = pd.read_csv("seaborn-data/tips.csv")

# Residual plot: regress the tip on party size and total bill.
predictors = tips[['size', 'total_bill']]
response = tips['tip']
X_train, X_val, y_train, y_val = train_test_split(
    predictors, response, test_size=0.2, random_state=42
)

# Fit the linear regression and predict the held-out tips.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
tip_predictions = regressor.predict(X_val)

# Evaluate against the validation targets and save the residual plot.
lr_evaluation = Evaluation(y_val)
chart = lr_evaluation.residplot(y_predicted=tip_predictions, color='red')
chart.savefig("predicted.png")
页面更新:2024-03-19
本站资料均由网友自行发布提供,仅用于学习交流。如有版权问题,请与我联系,QQ:4156828
© CopyRight 2008-2024 All Rights Reserved. Powered By bs178.com 闽ICP备11008920号-3
闽公网安备35020302034844号