VerticaPy ML 速查表(v0.10.1)¶
VerticaPy 通过 Python 接口支持完整的机器学习工作流。详见 VerticaPy 官方文档和示例。
一、数据预处理¶
from verticapy.utilities import *
import verticapy as vp
# --- 加载数据 ---
VDataFrame = vp.read_csv("filename.csv")
# --- 统计摘要 ---
VDataFrame.describe()
VDataFrame.describe(columns=["col1", "col2"], method="categorical")
# --- 异常值检测 ---
VDataFrame.outliers_plot(["col1", "col2"]) # 2D 可视化
VDataFrame.outliers(columns=["col1", "col2"], name="outlier_col") # 标注异常列
# --- 相关性 ---
VDataFrame.corr(method="pearson") # 全矩阵
VDataFrame.corr(["col1", "col2"], method="spearman") # 两列间
# --- 归一化 ---
VDataFrame.normalize() # 默认 zscore
VDataFrame.normalize(columns=["col1", "col2"], method="minmax") # min-max
# --- PCA 降维 ---
from verticapy.learn.decomposition import PCA
model = PCA("PCA_name")
model.fit(VDataFrame)
model.transform(n_components=2)
# --- 分类变量编码 ---
VDataFrame.label_encode() # 标签编码
VDataFrame["col"].one_hot_encode() # One-hot 编码
VDataFrame["col"].mean_encode("target") # 均值编码(需指定响应列)
# --- 缺失值填充 ---
VDataFrame.count_percent() # 各列缺失比例
VDataFrame["col"].fillna(method="auto") # 自动:数值→均值,分类→众数
VDataFrame["col"].fillna(method="avg", by=["partition_col"]) # 按分组均值
# --- 不平衡数据处理 ---
VDataFrame.balance(column="target") # 默认 hybrid
VDataFrame.balance(column="target", method="under", x=0.5) # 自定义比例
VDataFrame["target"].topk(k=3) # 查看值分布
# --- 采样 ---
VDataFrame.sample(x=0.2) # 按比例随机
VDataFrame.sample(n=100) # 按数量
VDataFrame.sample(x=0.3, method="stratified", by=["target"]) # 分层采样(需用 by 指定分层列)
二、模型构建¶
回归¶
# 线性回归
from verticapy.learn.linear_model import LinearRegression
model = LinearRegression(name="public.model_name")
# SVM 回归
from verticapy.learn.svm import LinearSVR
model = LinearSVR(name="model", acceptable_error_margin=0.5)
# 随机森林
from verticapy.learn.ensemble import RandomForestRegressor
model = RandomForestRegressor(name="model", n_estimators=20,
max_features="auto", max_leaf_nodes=32, sample=0.7,
max_depth=3, min_samples_leaf=5, min_info_gain=0.0, nbins=32)
# XGBoost
from verticapy.learn.ensemble import XGBoostRegressor
model = XGBoostRegressor(name="model", max_ntree=10, max_depth=5, nbins=32,
objective="squarederror", split_proposal_method="global",
learning_rate=0.1, min_split_loss=0, weight_reg=0, sample=1)
# AutoML
from verticapy.learn.delphi import AutoML
model = AutoML(name="model", estimator_type="regressor", cv=3, stepwise=True)
分类¶
# 逻辑回归
from verticapy.learn.linear_model import LogisticRegression
model = LogisticRegression(name="model", penalty="L2", tol=1e-4,
C=1, max_iter=100, solver="Newton") # CGD 仅适用于 L1 / ENet 正则化
# SVM 分类
from verticapy.learn.svm import LinearSVC
model = LinearSVC(name="model", tol=1e-4, C=1.0,
fit_intercept=True, intercept_mode="regularized", max_iter=100)
# 随机森林分类
from verticapy.learn.ensemble import RandomForestClassifier
model = RandomForestClassifier(name="model", n_estimators=20, max_depth=3, ...)
# XGBoost 分类
from verticapy.learn.ensemble import XGBoostClassifier
model = XGBoostClassifier(name="model", max_ntree=10, max_depth=5, ...)
# AutoML 分类
model = AutoML(name="model", estimator_type="multi", cv=3, stepwise=True)
基于近邻的分类¶
# K-Neighbors
from verticapy.learn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(name="model", n_neighbors=5, p=2)
# Nearest Centroid
from verticapy.learn.neighbors import NearestCentroid
model = NearestCentroid(name="model", p=2)
三、训练与预测¶
# 训练
model.fit(VDataFrame, ["x1", "x2", "x3"], "y")
# 预测并添加到 VDataFrame
model.predict(VDataFrame, X=["x1", "x2", "x3"], name="pred_col")
回归评估指标¶
| 指标 | 代码 |
|---|---|
| MSE | model.score("mse") |
| R² | model.score("r2") |
| Adjusted R² | model.score("r2a") |
| RMSE | model.score("rmse") |
| MAE | model.score("mae") |
| Max Error | model.score("max") |
| AIC | model.score("aic") |
| BIC | model.score("bic") |
| Explained Variance | model.score("var") |
分类评估¶
# 混淆矩阵
model.confusion_matrix(pos_label="Label", cutoff=0.33)
# Lift Chart
from verticapy.learn.model_selection import lift_chart
lift_chart("Response", "Probability", VDataFrame)
# ROC Curve
model.roc_curve(nbins=12)
四、模型管理¶
memModel(从属性构建模型)¶
from verticapy.learn.memmodel import memModel
# 线性回归示例
model = memModel(model_type="LinearRegression",
attributes={"coefficients": [0.5, 1.2], "intercept": 2})
# 生成 SQL 部署代码
model.predict_sql(["x1", "x2"])