shuaizhou hat die Gist bearbeitet 3 days ago. Zu Änderung gehen
1 file changed, 95 insertions
multinomial_logistic.2.py(Datei erstellt)
| @@ -0,0 +1,95 @@ | |||
| 1 | + | import io | |
| 2 | + | from typing import List, Tuple | |
| 3 | + | ||
| 4 | + | import numpy as np | |
| 5 | + | import pandas as pd | |
| 6 | + | from sklearn.linear_model import LinearRegression | |
| 7 | + | ||
| 8 | + | ||
def fit_exact_webtool_softmax(
    x: pd.DataFrame, y: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """Recover the linear part of the web tool's softmax from paired data.

    The paired inputs/outputs (data_2300_skin.csv / result_2300_skin.csv)
    are used to back out the parameters. For a softmax model the log-odds
    against a reference class are linear in the inputs:

        log(P_k / P_ref) = b_k + X @ beta_k

    The last column of ``PROB_COLS`` is taken as the reference class (the
    original notes name it Black / DarktoBlack, with VeryPale, Pale,
    Intermediate and Dark as the fitted classes). Every other class is
    fitted by ordinary least squares on its log-odds.

    Args:
        x: Feature table; every column is used as a predictor.
        y: Probability table containing the ``PROB_COLS`` columns, with
            index labels matching ``x``.

    Returns:
        A tuple ``(coef_matrix, intercepts, features)`` where
        ``coef_matrix`` has shape ``(n_features, n_classes)``,
        ``intercepts`` has shape ``(n_classes,)`` and ``features`` is the
        list of column names of ``x``. The reference class occupies the
        last column / entry and is all zeros.

    Raises:
        ValueError: If no row has complete features together with finite,
            strictly positive probabilities.
    """
    features = list(x.columns)

    # Fit only on rows without missing features so NA values cannot
    # distort the regression.
    x_complete = x.loc[~x.isna().any(axis=1), features]
    probs = y.loc[x_complete.index, PROB_COLS].to_numpy(dtype=float)

    # Log-odds are only defined for finite, strictly positive probabilities.
    valid = np.isfinite(probs).all(axis=1) & (probs > 0).all(axis=1)
    if not valid.any():
        raise ValueError(
            "no usable rows: need complete features and finite, "
            "strictly positive probabilities"
        )

    # Same float coercion as predict_softmax, so both see identical inputs.
    design = x_complete.to_numpy(dtype=float)[valid]
    probs = probs[valid]

    # Log-odds of every non-reference class against the last class.
    target = np.log(probs[:, :-1] / probs[:, -1:])

    # One multi-target least-squares fit; with a 2-D target, coef_ has
    # shape (n_targets, n_features) and intercept_ has shape (n_targets,).
    lr = LinearRegression()
    lr.fit(design, target)

    # The reference class contributes an all-zero column and a zero intercept.
    coef_matrix = np.column_stack([lr.coef_.T, np.zeros(len(features))])
    intercepts = np.append(lr.intercept_, 0.0)

    return coef_matrix, intercepts, features
| 55 | + | ||
| 56 | + | ||
def predict_softmax(
    x: pd.DataFrame,
    coef_matrix: np.ndarray,
    intercepts: np.ndarray,
    features: List[str],
    fillna: float = 0.0,
) -> pd.DataFrame:
    """Predict class probabilities from recovered softmax parameters.

    Args:
        x: Input table; only the columns listed in ``features`` are read.
        coef_matrix: Coefficients, one column per class.
        intercepts: Per-class intercepts added to the linear scores.
        features: Names (and order) of the columns of ``x`` to use.
        fillna: Value substituted for missing feature entries.

    Returns:
        A DataFrame of probabilities indexed like ``x``, with one column
        per entry of ``PROB_COLS``; each row sums to 1.
    """
    filled = x[features].fillna(fillna)
    design = filled.to_numpy(dtype=float)

    logits = design @ coef_matrix + intercepts

    # Shift each row by its maximum before exponentiating so exp() cannot
    # overflow; the softmax result is unchanged by the shift.
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    probs = weights / weights.sum(axis=1, keepdims=True)

    return pd.DataFrame(probs, index=x.index, columns=PROB_COLS)
| 73 | + | ||
| 74 | + | ||
| 75 | + | ||
def main() -> None:
    """Fit the softmax surrogate on the paired CSV files and predict with it.

    Reads ``data_2300_skin.csv`` (inputs) and ``result_2300_skin.csv``
    (outputs) from the current working directory, fits the linear softmax
    parameters on the pair, and predicts probabilities for the same inputs.
    """
    # Load the two uploaded files: inputs and the matching outputs.
    raw_inputs = pd.read_csv("data_2300_skin.csv")
    raw_outputs = pd.read_csv("result_2300_skin.csv")

    # Curated paper table.
    # NOTE(review): the value is not used further in the visible code.
    paper_table = load_paper_table()

    # Preprocess the input loci into the feature table.
    design = preprocess_input(raw_inputs)

    # Fit the softmax linear parameters matching the web tool from the
    # paired input/output data.
    fitted = fit_exact_webtool_softmax(design, raw_outputs[PROB_COLS])
    coef_matrix, intercepts, features = fitted

    # Predict.
    # NOTE(review): the predictions are not used further in the visible code.
    predictions = predict_softmax(
        design, coef_matrix, intercepts, features, fillna=0.0
    )
Neuer
Älter