Última actividad 3 days ago

multinomial_logistic.1.py Sin formato
1import numpy as np
2import pandas as pd
3
4def build_design_matrix(x: pd.DataFrame) -> np.ndarray:
5 """
6 在最前面加一列 1 作为截距项
7 """
8 X = x.to_numpy(dtype=float)
9 intercept = np.ones((X.shape[0], 1), dtype=float)
10 return np.hstack([intercept, X])
11
12
13def fit_multinomial_from_probabilities(
14 x: pd.DataFrame,
15 prob: pd.DataFrame,
16 reference_index: int = 4,
17):
18 """
19 利用已知类别概率 p,拟合 multinomial logistic:
20 log(p_k / p_ref) = alpha_k + x^T beta_k
21
22 返回:
23 coef_mat: shape = (p+1, K)
24 第一行是截距
25 reference 类对应整列为 0
26 feature_names
27 """
28 feature_names = list(x.columns)
29
30 # 只保留完整输入 + 正概率样本
31 mask_x = ~x.isna().any(axis=1)
32 mask_p = np.isfinite(prob[PROB_COLS]).all(axis=1) & (prob[PROB_COLS] > 0).all(axis=1)
33 mask = mask_x & mask_p
34
35 x_fit = x.loc[mask, feature_names]
36 p_fit = prob.loc[mask, PROB_COLS]
37
38 X = build_design_matrix(x_fit) # n x (p+1)
39 P = p_fit.to_numpy(dtype=float) # n x K
40
41 n, d = X.shape
42 K = P.shape[1]
43
44 coef_mat = np.zeros((d, K), dtype=float)
45
46 pref = P[:, [reference_index]]
47
48 # 对非参考类逐个拟合 log(p_k / p_ref)
49 for k in range(K):
50 if k == reference_index:
51 continue
52
53 z = np.log(P[:, k:k+1] / pref).ravel() # n,
54 theta, residuals, rank, s = np.linalg.lstsq(X, z, rcond=None)
55 coef_mat[:, k] = theta
56
57 return coef_mat, feature_names, mask
58
59
60def predict_multinomial(
61 x: pd.DataFrame,
62 coef_mat: np.ndarray,
63 feature_names,
64):
65 """
66 根据拟合得到的系数矩阵预测 softmax 概率
67 """
68 X = build_design_matrix(x[feature_names].fillna(0.0))
69 logits = X @ coef_mat # n x K
70
71 # 数值稳定
72 logits = logits - logits.max(axis=1, keepdims=True)
73 exp_logits = np.exp(logits)
74 probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)
75
76 return pd.DataFrame(probs, columns=PROB_COLS, index=x.index)
77
78
79def export_coefficients(coef_mat: np.ndarray, feature_names):
80 """
81 导出更直观的参数表
82 """
83 rows = ["Intercept"] + list(feature_names)
84 coef_df = pd.DataFrame(
85 coef_mat,
86 index=rows,
87 columns=PROB_COLS
88 )
89 return coef_df
90
91
92def main():
93 data = pd.read_csv("data_2300_skin.csv")
94 result = pd.read_csv("result_2300_skin.csv")
95
96 x = preprocess_genotype(data) # 0,1,2
97
98 coef_mat, feature_names, fit_mask = fit_multinomial_from_probabilities(
99 x=x,
100 prob=result,
101 reference_index=4, # 以 PDarktoBlackSkin 为参考类
102 )
103
104 pred = predict_multinomial(x, coef_mat, feature_names)
105
106 coef_df = export_coefficients(coef_mat, feature_names)
107
108
109
110 return x, coef_mat, feature_names
111
112