Last active 3 days ago

shuaizhou revised this gist 3 days ago. Go to revision

1 file changed, 111 insertions

multinomial_logistic.1.py(file created)

@@ -0,0 +1,111 @@
1 + import numpy as np
2 + import pandas as pd
3 +
4 + def build_design_matrix(x: pd.DataFrame) -> np.ndarray:
5 + """
6 + 在最前面加一列 1 作为截距项
7 + """
8 + X = x.to_numpy(dtype=float)
9 + intercept = np.ones((X.shape[0], 1), dtype=float)
10 + return np.hstack([intercept, X])
11 +
12 +
13 + def fit_multinomial_from_probabilities(
14 + x: pd.DataFrame,
15 + prob: pd.DataFrame,
16 + reference_index: int = 4,
17 + ):
18 + """
19 + 利用已知类别概率 p,拟合 multinomial logistic:
20 + log(p_k / p_ref) = alpha_k + x^T beta_k
21 +
22 + 返回:
23 + coef_mat: shape = (p+1, K)
24 + 第一行是截距
25 + reference 类对应整列为 0
26 + feature_names
27 + """
28 + feature_names = list(x.columns)
29 +
30 + # 只保留完整输入 + 正概率样本
31 + mask_x = ~x.isna().any(axis=1)
32 + mask_p = np.isfinite(prob[PROB_COLS]).all(axis=1) & (prob[PROB_COLS] > 0).all(axis=1)
33 + mask = mask_x & mask_p
34 +
35 + x_fit = x.loc[mask, feature_names]
36 + p_fit = prob.loc[mask, PROB_COLS]
37 +
38 + X = build_design_matrix(x_fit) # n x (p+1)
39 + P = p_fit.to_numpy(dtype=float) # n x K
40 +
41 + n, d = X.shape
42 + K = P.shape[1]
43 +
44 + coef_mat = np.zeros((d, K), dtype=float)
45 +
46 + pref = P[:, [reference_index]]
47 +
48 + # 对非参考类逐个拟合 log(p_k / p_ref)
49 + for k in range(K):
50 + if k == reference_index:
51 + continue
52 +
53 + z = np.log(P[:, k:k+1] / pref).ravel() # n,
54 + theta, residuals, rank, s = np.linalg.lstsq(X, z, rcond=None)
55 + coef_mat[:, k] = theta
56 +
57 + return coef_mat, feature_names, mask
58 +
59 +
60 + def predict_multinomial(
61 + x: pd.DataFrame,
62 + coef_mat: np.ndarray,
63 + feature_names,
64 + ):
65 + """
66 + 根据拟合得到的系数矩阵预测 softmax 概率
67 + """
68 + X = build_design_matrix(x[feature_names].fillna(0.0))
69 + logits = X @ coef_mat # n x K
70 +
71 + # 数值稳定
72 + logits = logits - logits.max(axis=1, keepdims=True)
73 + exp_logits = np.exp(logits)
74 + probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)
75 +
76 + return pd.DataFrame(probs, columns=PROB_COLS, index=x.index)
77 +
78 +
79 + def export_coefficients(coef_mat: np.ndarray, feature_names):
80 + """
81 + 导出更直观的参数表
82 + """
83 + rows = ["Intercept"] + list(feature_names)
84 + coef_df = pd.DataFrame(
85 + coef_mat,
86 + index=rows,
87 + columns=PROB_COLS
88 + )
89 + return coef_df
90 +
91 +
92 + def main():
93 + data = pd.read_csv("data_2300_skin.csv")
94 + result = pd.read_csv("result_2300_skin.csv")
95 +
96 + x = preprocess_genotype(data) # 0,1,2
97 +
98 + coef_mat, feature_names, fit_mask = fit_multinomial_from_probabilities(
99 + x=x,
100 + prob=result,
101 + reference_index=4, # 以 PDarktoBlackSkin 为参考类
102 + )
103 +
104 + pred = predict_multinomial(x, coef_mat, feature_names)
105 +
106 + coef_df = export_coefficients(coef_mat, feature_names)
107 +
108 +
109 +
110 + return x, coef_mat, feature_names
111 +
Newer Older