multinomial_logistic.1.py
· 2.7 KiB · Python
Sin formato
import numpy as np
import pandas as pd
def build_design_matrix(x: pd.DataFrame) -> np.ndarray:
    """Prepend an all-ones intercept column to the feature table.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; one row per sample.

    Returns
    -------
    np.ndarray of shape (n_samples, n_features + 1)
        Column 0 is all ones; the remaining columns are ``x`` cast to float.
    """
    values = x.to_numpy(dtype=float)
    ones = np.ones(values.shape[0], dtype=float)
    return np.column_stack([ones, values])
def fit_multinomial_from_probabilities(
    x: pd.DataFrame,
    prob: pd.DataFrame,
    reference_index: int = 4,
    prob_cols=None,
):
    """Fit a multinomial logistic model from known class probabilities.

    For each non-reference class k this solves the linear least-squares
    problem

        log(p_k / p_ref) = alpha_k + x^T beta_k

    which recovers the coefficients exactly when the probabilities come
    from a softmax model.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table (e.g. genotype counts 0/1/2); rows containing any
        NaN are excluded from the fit.
    prob : pd.DataFrame
        Per-sample class probabilities; must contain ``prob_cols``. Rows
        with non-finite or non-positive entries are excluded (the
        log-ratio is undefined there).
    reference_index : int, default 4
        Position (within ``prob_cols``) of the reference class; its
        coefficient column is pinned to 0.
    prob_cols : sequence of str, optional
        Probability column names. Defaults to the module-level
        ``PROB_COLS`` constant for backward compatibility.

    Returns
    -------
    coef_mat : np.ndarray of shape (p + 1, K)
        Row 0 holds the intercepts; the reference column is all zeros.
    feature_names : list of str
        Feature column order used to build the design matrix.
    mask : pd.Series of bool
        Which input rows were used for the fit.

    Raises
    ------
    ValueError
        If ``reference_index`` is out of range for the given classes.
    """
    if prob_cols is None:
        prob_cols = PROB_COLS  # module-level constant; assumed defined elsewhere in this file
    feature_names = list(x.columns)

    # Keep only rows with complete features and strictly positive, finite
    # probabilities (otherwise log(p_k / p_ref) below is undefined).
    # NOTE(review): assumes x and prob share the same row index — confirm.
    p_block = prob[prob_cols]
    mask = (
        ~x.isna().any(axis=1)
        & np.isfinite(p_block).all(axis=1)
        & (p_block > 0).all(axis=1)
    )

    x_fit = x.loc[mask, feature_names]
    P = prob.loc[mask, prob_cols].to_numpy(dtype=float)  # n x K

    # Intercept column + features (same layout as build_design_matrix);
    # inlined so this function is self-contained.
    X = np.hstack(
        [np.ones((x_fit.shape[0], 1)), x_fit.to_numpy(dtype=float)]
    )  # n x (p + 1)

    d = X.shape[1]
    K = P.shape[1]
    if not 0 <= reference_index < K:
        raise ValueError(
            f"reference_index {reference_index} out of range for {K} classes"
        )

    coef_mat = np.zeros((d, K), dtype=float)
    log_ref = np.log(P[:, reference_index])

    # One independent least-squares fit per non-reference class.
    for k in range(K):
        if k == reference_index:
            continue
        z = np.log(P[:, k]) - log_ref  # log-ratio target, length n
        theta, _residuals, _rank, _sv = np.linalg.lstsq(X, z, rcond=None)
        coef_mat[:, k] = theta

    return coef_mat, feature_names, mask
def predict_multinomial(
    x: pd.DataFrame,
    coef_mat: np.ndarray,
    feature_names,
    prob_cols=None,
):
    """Predict softmax class probabilities from a fitted coefficient matrix.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; must contain ``feature_names``. Missing values are
        imputed as 0.0.
    coef_mat : np.ndarray of shape (p + 1, K)
        Coefficients as returned by ``fit_multinomial_from_probabilities``
        (row 0 = intercepts; reference column all zeros).
    feature_names : sequence of str
        Feature columns, in the order used at fit time.
    prob_cols : sequence of str, optional
        Output column names. Defaults to the module-level ``PROB_COLS``
        constant for backward compatibility.

    Returns
    -------
    pd.DataFrame of shape (n, K)
        Softmax probabilities (each row sums to 1), indexed like ``x``.
    """
    if prob_cols is None:
        prob_cols = PROB_COLS  # module-level constant; assumed defined elsewhere in this file

    # NOTE(review): NaN features are imputed with 0.0 here while the fit
    # drops such rows entirely — confirm this asymmetry is intended.
    feats = x[feature_names].fillna(0.0)

    # Intercept column + features (same layout as build_design_matrix);
    # inlined so this function is self-contained.
    X = np.hstack([np.ones((feats.shape[0], 1)), feats.to_numpy(dtype=float)])
    logits = X @ coef_mat  # n x K

    # Subtract the row-wise max before exponentiating for numerical stability.
    logits = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(logits)
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

    return pd.DataFrame(probs, columns=prob_cols, index=x.index)
def export_coefficients(coef_mat: np.ndarray, feature_names, prob_cols=None):
    """Return the coefficient matrix as a labelled DataFrame.

    Rows are "Intercept" followed by the feature names; columns are the
    class labels.

    Parameters
    ----------
    coef_mat : np.ndarray of shape (len(feature_names) + 1, K)
        Coefficients with the intercept in row 0.
    feature_names : sequence of str
        Feature labels for rows 1..p.
    prob_cols : sequence of str, optional
        Class (column) labels. Defaults to the module-level ``PROB_COLS``
        constant for backward compatibility.

    Returns
    -------
    pd.DataFrame
        Labelled copy of ``coef_mat``.
    """
    if prob_cols is None:
        prob_cols = PROB_COLS  # module-level constant; assumed defined elsewhere in this file
    row_labels = ["Intercept", *feature_names]
    return pd.DataFrame(coef_mat, index=row_labels, columns=prob_cols)
def main():
    """Load genotype data and pre-computed class probabilities, fit the
    multinomial model, and return the raw results.

    Returns
    -------
    tuple
        (feature table, coefficient matrix, feature names).
    """
    genotypes = pd.read_csv("data_2300_skin.csv")
    class_probs = pd.read_csv("result_2300_skin.csv")

    # Additive 0/1/2 genotype encoding (helper defined elsewhere in the file).
    x = preprocess_genotype(genotypes)

    # Class at position 4 (PDarktoBlackSkin) is the softmax reference class.
    coef_mat, feature_names, fit_mask = fit_multinomial_from_probabilities(
        x=x,
        prob=class_probs,
        reference_index=4,
    )

    # NOTE(review): pred and coef_df are computed but never returned or
    # saved — confirm whether they should be written to disk.
    pred = predict_multinomial(x, coef_mat, feature_names)
    coef_df = export_coefficients(coef_mat, feature_names)

    return x, coef_mat, feature_names
| 1 | import numpy as np |
| 2 | import pandas as pd |
| 3 | |
| 4 | def build_design_matrix(x: pd.DataFrame) -> np.ndarray: |
| 5 | """ |
| 6 | 在最前面加一列 1 作为截距项 |
| 7 | """ |
| 8 | X = x.to_numpy(dtype=float) |
| 9 | intercept = np.ones((X.shape[0], 1), dtype=float) |
| 10 | return np.hstack([intercept, X]) |
| 11 | |
| 12 | |
| 13 | def fit_multinomial_from_probabilities( |
| 14 | x: pd.DataFrame, |
| 15 | prob: pd.DataFrame, |
| 16 | reference_index: int = 4, |
| 17 | ): |
| 18 | """ |
| 19 | 利用已知类别概率 p,拟合 multinomial logistic: |
| 20 | log(p_k / p_ref) = alpha_k + x^T beta_k |
| 21 | |
| 22 | 返回: |
| 23 | coef_mat: shape = (p+1, K) |
| 24 | 第一行是截距 |
| 25 | reference 类对应整列为 0 |
| 26 | feature_names |
| 27 | """ |
| 28 | feature_names = list(x.columns) |
| 29 | |
| 30 | # 只保留完整输入 + 正概率样本 |
| 31 | mask_x = ~x.isna().any(axis=1) |
| 32 | mask_p = np.isfinite(prob[PROB_COLS]).all(axis=1) & (prob[PROB_COLS] > 0).all(axis=1) |
| 33 | mask = mask_x & mask_p |
| 34 | |
| 35 | x_fit = x.loc[mask, feature_names] |
| 36 | p_fit = prob.loc[mask, PROB_COLS] |
| 37 | |
| 38 | X = build_design_matrix(x_fit) # n x (p+1) |
| 39 | P = p_fit.to_numpy(dtype=float) # n x K |
| 40 | |
| 41 | n, d = X.shape |
| 42 | K = P.shape[1] |
| 43 | |
| 44 | coef_mat = np.zeros((d, K), dtype=float) |
| 45 | |
| 46 | pref = P[:, [reference_index]] |
| 47 | |
| 48 | # 对非参考类逐个拟合 log(p_k / p_ref) |
| 49 | for k in range(K): |
| 50 | if k == reference_index: |
| 51 | continue |
| 52 | |
| 53 | z = np.log(P[:, k:k+1] / pref).ravel() # n, |
| 54 | theta, residuals, rank, s = np.linalg.lstsq(X, z, rcond=None) |
| 55 | coef_mat[:, k] = theta |
| 56 | |
| 57 | return coef_mat, feature_names, mask |
| 58 | |
| 59 | |
| 60 | def predict_multinomial( |
| 61 | x: pd.DataFrame, |
| 62 | coef_mat: np.ndarray, |
| 63 | feature_names, |
| 64 | ): |
| 65 | """ |
| 66 | 根据拟合得到的系数矩阵预测 softmax 概率 |
| 67 | """ |
| 68 | X = build_design_matrix(x[feature_names].fillna(0.0)) |
| 69 | logits = X @ coef_mat # n x K |
| 70 | |
| 71 | # 数值稳定 |
| 72 | logits = logits - logits.max(axis=1, keepdims=True) |
| 73 | exp_logits = np.exp(logits) |
| 74 | probs = exp_logits / exp_logits.sum(axis=1, keepdims=True) |
| 75 | |
| 76 | return pd.DataFrame(probs, columns=PROB_COLS, index=x.index) |
| 77 | |
| 78 | |
| 79 | def export_coefficients(coef_mat: np.ndarray, feature_names): |
| 80 | """ |
| 81 | 导出更直观的参数表 |
| 82 | """ |
| 83 | rows = ["Intercept"] + list(feature_names) |
| 84 | coef_df = pd.DataFrame( |
| 85 | coef_mat, |
| 86 | index=rows, |
| 87 | columns=PROB_COLS |
| 88 | ) |
| 89 | return coef_df |
| 90 | |
| 91 | |
| 92 | def main(): |
| 93 | data = pd.read_csv("data_2300_skin.csv") |
| 94 | result = pd.read_csv("result_2300_skin.csv") |
| 95 | |
| 96 | x = preprocess_genotype(data) # 0,1,2 |
| 97 | |
| 98 | coef_mat, feature_names, fit_mask = fit_multinomial_from_probabilities( |
| 99 | x=x, |
| 100 | prob=result, |
| 101 | reference_index=4, # 以 PDarktoBlackSkin 为参考类 |
| 102 | ) |
| 103 | |
| 104 | pred = predict_multinomial(x, coef_mat, feature_names) |
| 105 | |
| 106 | coef_df = export_coefficients(coef_mat, feature_names) |
| 107 | |
| 108 | |
| 109 | |
| 110 | return x, coef_mat, feature_names |
| 111 | |
| 112 |