Ostatnio aktywny 3 days ago

multinomial_logistic.2.py Surowy
1import io
2from typing import List, Tuple
3
4import numpy as np
5import pandas as pd
6from sklearn.linear_model import LinearRegression
7
8
def fit_exact_webtool_softmax(
    x: pd.DataFrame, y: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Recover the linear part of a softmax model from paired inputs/outputs.

    The original notes describe the inputs as the paired files
    data_2300_skin.csv (features) and result_2300_skin.csv (probabilities
    reported by the web tool).

    Approach: for each of the first four probability columns k, regress

        log(P_k / P_ref) = b_k + X @ beta_k

    on the features, where P_ref is the fifth probability column. The
    reference class gets all-zero coefficients and a zero intercept.
    (Per the original notes the four classes are VeryPale, Pale,
    Intermediate and Dark, with Black as the reference.)

    Args:
        x: Feature table; every column is used as a predictor.
        y: Probability table containing the ``PROB_COLS`` columns, indexed
            compatibly with ``x`` (rows are looked up by ``x``'s index).

    Returns:
        A tuple ``(coef_matrix, intercepts, features)`` where ``coef_matrix``
        has shape ``[n_features, 5]``, ``intercepts`` has shape ``[5]`` and
        ``features`` is the list of column names of ``x``.
    """
    features = list(x.columns)

    # Fit on fully observed rows only, so missing values cannot distort it.
    fully_observed = x.notna().all(axis=1)
    x_rows = x[fully_observed]
    y_rows = y.loc[x_rows.index]

    # Discard rows whose probabilities are non-finite or not strictly
    # positive: the log-ratio below is undefined for them.
    probs = y_rows[PROB_COLS].to_numpy()
    usable = (np.isfinite(probs) & (probs > 0)).all(axis=1)
    design = x_rows.loc[usable, features].to_numpy()
    probs = probs[usable]

    # Log-odds of the first four classes relative to the fifth (reference).
    log_odds = np.log(probs[:, :4] / probs[:, 4][:, None])

    # One independent ordinary-least-squares fit per non-reference class.
    models = [
        LinearRegression().fit(design, log_odds[:, k]) for k in range(4)
    ]

    # Append the reference class: zero coefficients, zero intercept.
    coef_columns = [m.coef_ for m in models] + [np.zeros(len(features))]
    offsets = [m.intercept_ for m in models] + [0.0]

    coef_matrix = np.column_stack(coef_columns)  # shape: [n_features, 5]
    intercepts = np.array(offsets)  # shape: [5]

    return coef_matrix, intercepts, features
55
56
def predict_softmax(
    x: pd.DataFrame,
    coef_matrix: np.ndarray,
    intercepts: np.ndarray,
    features: List[str],
    fillna: float = 0.0,
) -> pd.DataFrame:
    """
    Predict class probabilities from recovered softmax parameters.

    Args:
        x: Input table; only the ``features`` columns are used.
        coef_matrix: Coefficients, one column per class.
        intercepts: Per-class intercepts added to the linear scores.
        features: Names of the columns of ``x`` to use, in the order
            matching the rows of ``coef_matrix``.
        fillna: Value substituted for missing feature entries.

    Returns:
        A DataFrame of row-normalised probabilities with columns
        ``PROB_COLS`` and the same index as ``x``.
    """
    design = x[features].fillna(fillna).to_numpy(dtype=float)
    logits = design @ coef_matrix + intercepts

    # Subtract each row's maximum before exponentiating for numerical
    # stability; the softmax result is unchanged by this shift.
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    probabilities = weights / weights.sum(axis=1, keepdims=True)

    return pd.DataFrame(probabilities, columns=PROB_COLS, index=x.index)
73
74
75
76def main() -> None:
77 # 读取你上传的两个文件
78 data = pd.read_csv("data_2300_skin.csv")
79 result = pd.read_csv("result_2300_skin.csv")
80
81 # 保存整理后的论文表
82 paper = load_paper_table()
83
84 # 预处理输入位点
85 x = preprocess_input(data)
86
87 # 用上传的输入/输出成对数据,拟合网站对应的 softmax 线性参数
88 coef_matrix, intercepts, features = fit_exact_webtool_softmax(
89 x, result[PROB_COLS]
90 )
91
92 # 预测
93 pred = predict_softmax(
94 x, coef_matrix, intercepts, features, fillna=0.0
95 )
96