import numpy as np import pandas as pd def build_design_matrix(x: pd.DataFrame) -> np.ndarray: """ 在最前面加一列 1 作为截距项 """ X = x.to_numpy(dtype=float) intercept = np.ones((X.shape[0], 1), dtype=float) return np.hstack([intercept, X]) def fit_multinomial_from_probabilities( x: pd.DataFrame, prob: pd.DataFrame, reference_index: int = 4, ): """ 利用已知类别概率 p,拟合 multinomial logistic: log(p_k / p_ref) = alpha_k + x^T beta_k 返回: coef_mat: shape = (p+1, K) 第一行是截距 reference 类对应整列为 0 feature_names """ feature_names = list(x.columns) # 只保留完整输入 + 正概率样本 mask_x = ~x.isna().any(axis=1) mask_p = np.isfinite(prob[PROB_COLS]).all(axis=1) & (prob[PROB_COLS] > 0).all(axis=1) mask = mask_x & mask_p x_fit = x.loc[mask, feature_names] p_fit = prob.loc[mask, PROB_COLS] X = build_design_matrix(x_fit) # n x (p+1) P = p_fit.to_numpy(dtype=float) # n x K n, d = X.shape K = P.shape[1] coef_mat = np.zeros((d, K), dtype=float) pref = P[:, [reference_index]] # 对非参考类逐个拟合 log(p_k / p_ref) for k in range(K): if k == reference_index: continue z = np.log(P[:, k:k+1] / pref).ravel() # n, theta, residuals, rank, s = np.linalg.lstsq(X, z, rcond=None) coef_mat[:, k] = theta return coef_mat, feature_names, mask def predict_multinomial( x: pd.DataFrame, coef_mat: np.ndarray, feature_names, ): """ 根据拟合得到的系数矩阵预测 softmax 概率 """ X = build_design_matrix(x[feature_names].fillna(0.0)) logits = X @ coef_mat # n x K # 数值稳定 logits = logits - logits.max(axis=1, keepdims=True) exp_logits = np.exp(logits) probs = exp_logits / exp_logits.sum(axis=1, keepdims=True) return pd.DataFrame(probs, columns=PROB_COLS, index=x.index) def export_coefficients(coef_mat: np.ndarray, feature_names): """ 导出更直观的参数表 """ rows = ["Intercept"] + list(feature_names) coef_df = pd.DataFrame( coef_mat, index=rows, columns=PROB_COLS ) return coef_df def main(): data = pd.read_csv("data_2300_skin.csv") result = pd.read_csv("result_2300_skin.csv") x = preprocess_genotype(data) # 0,1,2 coef_mat, feature_names, fit_mask = fit_multinomial_from_probabilities( x=x, prob=result, reference_index=4, # 以 PDarktoBlackSkin 为参考类 ) pred = predict_multinomial(x, coef_mat, feature_names) coef_df = export_coefficients(coef_mat, feature_names) return x, coef_mat, feature_names