shuaizhou revised this gist 3 days ago. Go to revision
1 file changed, 111 insertions
multinomial_logistic.1.py(file created)
| @@ -0,0 +1,111 @@ | |||
| 1 | + | import numpy as np | |
| 2 | + | import pandas as pd | |
| 3 | + | ||
def build_design_matrix(x: pd.DataFrame) -> np.ndarray:
    """Prepend an all-ones intercept column to the feature table.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; every column is cast to float.

    Returns
    -------
    np.ndarray of shape (n_samples, n_features + 1)
        Design matrix whose first column is the constant 1.0.
    """
    features = x.to_numpy(dtype=float)
    n_rows, n_cols = features.shape
    design = np.empty((n_rows, n_cols + 1), dtype=float)
    design[:, 0] = 1.0
    design[:, 1:] = features
    return design
| 11 | + | ||
| 12 | + | ||
def fit_multinomial_from_probabilities(
    x: pd.DataFrame,
    prob: pd.DataFrame,
    reference_index: int = 4,
):
    """Fit a multinomial logistic model from known class probabilities.

    For each non-reference class k, solves the ordinary least-squares
    problem on the log-odds target:

        log(p_k / p_ref) = alpha_k + x^T beta_k

    Parameters
    ----------
    x : pd.DataFrame
        Feature table (one row per sample).
    prob : pd.DataFrame
        Class-probability table; every column is treated as one class.
    reference_index : int
        Positional index of the reference class within ``prob``'s columns.
        Its coefficient column is fixed at zero.

    Returns
    -------
    coef_mat : np.ndarray of shape (n_features + 1, K)
        Row 0 is the intercept; the reference class column is all zeros.
    feature_names : list[str]
        Column names of ``x`` in the order used for fitting.
    mask : pd.Series[bool]
        Which input rows were used (complete features, finite and
        strictly positive probabilities).
    """
    feature_names = list(x.columns)
    # FIX: the original read an undefined module-level global PROB_COLS,
    # which raised NameError.  The class columns are taken from `prob`
    # itself instead, which also generalizes to any number of classes.
    prob_cols = list(prob.columns)

    # Keep only rows with complete features and finite, strictly positive
    # probabilities -- log of zero/negative probabilities is undefined.
    mask_x = ~x.isna().any(axis=1)
    p_vals = prob[prob_cols]
    mask_p = np.isfinite(p_vals).all(axis=1) & (p_vals > 0).all(axis=1)
    mask = mask_x & mask_p

    x_fit = x.loc[mask, feature_names]
    p_fit = prob.loc[mask, prob_cols]

    # Design matrix with a leading intercept column: n x (p + 1).
    X = np.hstack([
        np.ones((len(x_fit), 1), dtype=float),
        x_fit.to_numpy(dtype=float),
    ])
    P = p_fit.to_numpy(dtype=float)  # n x K

    d = X.shape[1]
    K = P.shape[1]

    # Reference class column is left at zero by construction.
    coef_mat = np.zeros((d, K), dtype=float)
    p_ref = P[:, [reference_index]]

    # Fit each non-reference class independently on its log-ratio target.
    for k in range(K):
        if k == reference_index:
            continue
        target = np.log(P[:, k:k + 1] / p_ref).ravel()  # shape (n,)
        theta, _residuals, _rank, _sv = np.linalg.lstsq(X, target, rcond=None)
        coef_mat[:, k] = theta

    return coef_mat, feature_names, mask
| 58 | + | ||
| 59 | + | ||
def predict_multinomial(
    x: pd.DataFrame,
    coef_mat: np.ndarray,
    feature_names,
    class_names=None,
):
    """Predict softmax class probabilities from fitted coefficients.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; missing values are imputed with 0 so prediction
        never fails on NaN.
    coef_mat : np.ndarray of shape (n_features + 1, K)
        Coefficient matrix; row 0 is the intercept.
    feature_names : sequence of str
        Columns of ``x`` to use, in fitting order.
    class_names : sequence or None
        Column labels for the returned probability frame; defaults to
        0..K-1.  (FIX: the original read an undefined module-level
        global PROB_COLS, which raised NameError.)

    Returns
    -------
    pd.DataFrame of shape (n, K)
        Row-stochastic probabilities, indexed like ``x``.
    """
    # Design matrix with a leading intercept column.
    X = np.hstack([
        np.ones((len(x), 1), dtype=float),
        x[feature_names].fillna(0.0).to_numpy(dtype=float),
    ])
    logits = X @ coef_mat  # n x K

    # Subtract the row-wise max before exponentiating for numerical
    # stability; the shift cancels in the softmax ratio.
    logits = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(logits)
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

    if class_names is None:
        class_names = list(range(coef_mat.shape[1]))
    return pd.DataFrame(probs, columns=class_names, index=x.index)
| 77 | + | ||
| 78 | + | ||
def export_coefficients(coef_mat: np.ndarray, feature_names, class_names=None):
    """Return the coefficient matrix as a human-readable labelled table.

    Parameters
    ----------
    coef_mat : np.ndarray of shape (n_features + 1, K)
        Coefficient matrix; row 0 is the intercept.
    feature_names : sequence of str
        Names for the feature rows (row 0 is labelled "Intercept").
    class_names : sequence or None
        Column labels, one per class; defaults to 0..K-1.
        (FIX: the original read an undefined module-level global
        PROB_COLS, which raised NameError.)

    Returns
    -------
    pd.DataFrame
        Coefficients indexed by ["Intercept", *feature_names].
    """
    rows = ["Intercept"] + list(feature_names)
    if class_names is None:
        class_names = list(range(coef_mat.shape[1]))
    return pd.DataFrame(coef_mat, index=rows, columns=class_names)
| 90 | + | ||
| 91 | + | ||
def main():
    """Load genotype data and class probabilities, fit the multinomial
    model, and return the fitted artefacts.

    NOTE(review): `preprocess_genotype` and the global `PROB_COLS` used
    by the helpers are not defined in this file -- presumably provided
    elsewhere; confirm before running.
    """
    genotype_raw = pd.read_csv("data_2300_skin.csv")
    probabilities = pd.read_csv("result_2300_skin.csv")

    # Encode genotypes as allele counts (0 / 1 / 2).
    features = preprocess_genotype(genotype_raw)

    # Class index 4 (PDarktoBlackSkin) serves as the reference class.
    coef_mat, feature_names, _fit_mask = fit_multinomial_from_probabilities(
        x=features,
        prob=probabilities,
        reference_index=4,
    )

    # Computed for inspection only; the original flow discards both.
    _pred = predict_multinomial(features, coef_mat, feature_names)
    _coef_df = export_coefficients(coef_mat, feature_names)

    return features, coef_mat, feature_names
| 111 | + | ||
Newer
Older