跳转至

VerticaPy ML 速查表(v0.10.1)

原文:VerticaPy Machine Learning V0.10.1 Cheat Sheet | PDF

VerticaPy 通过 Python 接口支持完整的机器学习工作流。更多用法详见 VerticaPy 官方文档中的示例。

一、数据预处理

from verticapy.utilities import *
import verticapy as vp

# --- 加载数据 ---
VDataFrame = vp.read_csv("filename.csv")

# --- 统计摘要 ---
VDataFrame.describe()
VDataFrame.describe(columns=["col1", "col2"], method="categorical")

# --- 异常值检测 ---
VDataFrame.outliers_plot(["col1", "col2"])                           # 2D 可视化
VDataFrame.outliers(columns=["col1", "col2"], name="outlier_col")     # 标注异常列

# --- 相关性 ---
VDataFrame.corr(method="pearson")                                     # 全矩阵
VDataFrame.corr(["col1", "col2"], method="spearman")                  # 两列间

# --- 归一化 ---
VDataFrame.normalize()                                                # 默认 zscore
VDataFrame.normalize(columns=["col1", "col2"], method="minmax")       # min-max

# --- PCA 降维 ---
from verticapy.learn.decomposition import PCA
model = PCA("PCA_name")
model.fit(VDataFrame)
model.transform(n_components=2)

# --- 分类变量编码 ---
VDataFrame.label_encode()                   # 标签编码
VDataFrame["col"].one_hot_encode()          # One-hot 编码
VDataFrame["col"].mean_encode()             # 均值编码

# --- 缺失值填充 ---
VDataFrame.count_percent()                  # 各列缺失比例
VDataFrame["col"].fillna(method="auto")     # 自动:数值→均值,分类→众数
VDataFrame["col"].fillna(method="avg", by=["partition_col"])  # 按分组均值

# --- 不平衡数据处理 ---
VDataFrame.balance(column=["target"])                              # 默认 hybrid
VDataFrame.balance(column=["target"], method="under", x=0.5)      # 自定义比例
VDataFrame["target"].topk(k=3)                                     # 查看值分布

# --- 采样 ---
VDataFrame.sample(x=0.2)                     # 按比例随机
VDataFrame.sample(n=100)                     # 按数量
VDataFrame.sample(x=0.3, method="stratified") # 分层采样

二、模型构建

回归

# 线性回归
from verticapy.learn.linear_model import LinearRegression
model = LinearRegression(name="public.model_name")

# SVM 回归
from verticapy.learn.svm import LinearSVR
model = LinearSVR(name="model", acceptable_error_margin=0.5)

# 随机森林
from verticapy.learn.ensemble import RandomForestRegressor
model = RandomForestRegressor(name="model", n_estimators=20,
    max_features="auto", max_leaf_nodes=32, sample=0.7,
    max_depth=3, min_sample_leaf=5, min_info_gain=0.0, nbins=32)

# XGBoost
from verticapy.learn.ensemble import XGBoostRegressor
model = XGBoostRegressor(name="model", max_ntree=10, max_depth=5, nbins=32,
    objective="squarederror", split_proposal_method="global",
    learning_rate=0.1, min_split_loss=0, weight_reg=0, sample=1)

# AutoML
from verticapy.learn.delphi import AutoML
model = AutoML(name="model", estimator_type="regressor", cv=3, stepwise=True)

分类

# 逻辑回归
from verticapy.learn.linear_model import LogisticRegression
model = LogisticRegression(name="model", penalty="L2", tol=1e-4,
    C=1, max_iter=100, solver="CGD")

# SVM 分类
from verticapy.learn.svm import LinearSVC
model = LinearSVC(name="model", tol=1e-4, C=1.0,
    fit_intercept=True, intercept_model="regularized", max_iter=100)

# 随机森林分类
from verticapy.learn.ensemble import RandomForestClassifier
model = RandomForestClassifier(name="model", n_estimators=20, max_depth=3, ...)

# XGBoost 分类
from verticapy.learn.ensemble import XGBoostClassifier
model = XGBoostClassifier(name="model", max_ntree=10, max_depth=5, ...)

# AutoML 分类
model = AutoML(name="model", estimator_type="multi", cv=3, stepwise=True)

近邻分类

# K-Neighbors
from verticapy.learn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(name="model", n_neighbors=5, p=2)

# Nearest Centroid
from verticapy.learn.neighbors import NearestCentroid
model = NearestCentroid(name="model", p=2)

三、训练与预测

# 训练
model.fit(VDataFrame, ["x1", "x2", "x3"], "y")

# 预测并添加到 VDataFrame
model.predict(VDataFrame, X=["x1", "x2", "x3"], name="pred_col")

回归评估指标

指标 代码
MSE model.score("mse")
R² model.score("r2")
Adjusted R² model.score("r2a")
RMSE model.score("rmse")
MAE model.score("mae")
Max Error model.score("max")
AIC model.score("aic")
BIC model.score("bic")
Explained Variance model.score("var")

分类评估

# 混淆矩阵
model.confusion_matrix(pos_label="Label", cutoff=0.33)

# Lift Chart
from verticapy.learn.model_selection import lift_chart
lift_chart("Response", "Probability", VDataFrame)

# ROC Curve
model.roc_curve(nbins=12)

四、模型管理

memModel(从属性构建模型)

from verticapy.learn.memmodel import memModel

# 线性回归示例
model = memModel(model_type="LinearRegression",
    attributes={"coefficients": [0.5, 1.2], "intercept": 2})

# 生成 SQL 部署代码
model.predict_sql(["x1", "x2"])

导出 SQL

# 生成完整的 SQL 部署代码
model.to_sql()

扩展阅读