Zuletzt aktiv 3 days ago

shuaizhou hat die Gist bearbeitet 3 days ago. Zu Änderung gehen

1 file changed, 95 insertions

multinomial_logistic.2.py(Datei erstellt)

@@ -0,0 +1,95 @@
1 + import io
2 + from typing import List, Tuple
3 +
4 + import numpy as np
5 + import pandas as pd
6 + from sklearn.linear_model import LinearRegression
7 +
8 +
def fit_exact_webtool_softmax(
    x: pd.DataFrame, y: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """Recover the linear part of a softmax model from paired inputs/outputs.

    Per the original note, the pairs come from ``data_2300_skin.csv``
    (inputs) and ``result_2300_skin.csv`` (probabilities reported by the
    web tool).

    Idea: taking the last column of ``PROB_COLS`` as the reference class,

        log(P_k / P_ref) = b_k + X @ beta_k

    for every other class ``k``, so each ``(b_k, beta_k)`` is fitted with an
    ordinary least-squares regression of the log-odds on the features.
    (The original note lists VeryPale, Pale, Intermediate and Dark as the
    non-reference classes and Black as the reference; ``PROB_COLS`` itself
    is defined elsewhere in the file.)

    Args:
        x: Feature table; every column is used as a predictor.
        y: Probability table containing the ``PROB_COLS`` columns and
            sharing its index labels with ``x``.

    Returns:
        ``(coef_matrix, intercepts, features)``: ``coef_matrix`` has shape
        ``[n_features, n_classes]``, ``intercepts`` has shape
        ``[n_classes]`` and ``features`` is the list of columns of ``x``.
        The last column / entry belongs to the reference class and is zero.

    Raises:
        ValueError: If no row survives the filtering of incomplete inputs
            and invalid probabilities.
    """
    features = list(x.columns)

    # Fit on complete rows only, so missing inputs cannot distort the fit.
    x_complete = x[~x.isna().any(axis=1)]
    y_complete = y.loc[x_complete.index]

    # Discard rows with non-finite or non-positive probabilities: the
    # log-odds computed below are undefined for them.
    probs = y_complete[PROB_COLS].to_numpy(dtype=float)
    valid = np.isfinite(probs).all(axis=1) & (probs > 0).all(axis=1)
    if not valid.any():
        raise ValueError(
            "no usable samples: every row has a missing feature or a "
            "non-finite / non-positive probability"
        )

    design = x_complete.loc[valid, features].to_numpy()
    probs = probs[valid]

    # Log-odds of each non-reference class relative to the last class.
    # Slicing with -1 keeps this in step with however many classes
    # PROB_COLS declares instead of assuming exactly five.
    target = np.log(probs[:, :-1] / probs[:, -1:])

    coefs = []
    intercepts = []
    for k in range(target.shape[1]):
        lr = LinearRegression()
        lr.fit(design, target[:, k])
        coefs.append(lr.coef_)
        intercepts.append(lr.intercept_)

    # The reference class has all-zero coefficients and a zero intercept.
    coefs.append(np.zeros(len(features)))
    intercepts.append(0.0)

    coef_matrix = np.column_stack(coefs)  # shape: [n_features, n_classes]
    return coef_matrix, np.array(intercepts), features
55 +
56 +
def predict_softmax(
    x: pd.DataFrame,
    coef_matrix: np.ndarray,
    intercepts: np.ndarray,
    features: List[str],
    fillna: float = 0.0,
) -> pd.DataFrame:
    """Predict class probabilities from recovered softmax parameters.

    Args:
        x: Input table; only the ``features`` columns are read.
        coef_matrix: Coefficients, multiplied as ``X @ coef_matrix``.
        intercepts: Per-class intercepts added to the linear scores.
        features: Column names of ``x`` to use, in coefficient order.
        fillna: Value substituted for missing feature entries.

    Returns:
        A DataFrame of probabilities with columns ``PROB_COLS`` and the
        index of ``x``; each row sums to one.
    """
    design = x[features].fillna(fillna).to_numpy(dtype=float)
    logits = design @ coef_matrix + intercepts

    # Subtract the row-wise maximum before exponentiating for numerical
    # stability; the softmax result is unchanged by this shift.
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    probabilities = weights / weights.sum(axis=1, keepdims=True)

    return pd.DataFrame(probabilities, columns=PROB_COLS, index=x.index)
73 +
74 +
75 +
def main() -> None:
    """Fit the softmax parameters on the paired CSV files and predict."""
    # Load the two uploaded files: model inputs and the reported results.
    raw_inputs = pd.read_csv("data_2300_skin.csv")
    reported = pd.read_csv("result_2300_skin.csv")

    # Original note: "save the tidied paper table". The helper is defined
    # elsewhere; its result is not used further in this function.
    paper_table = load_paper_table()

    # Preprocess the input loci into the feature table.
    x = preprocess_input(raw_inputs)

    # Fit the linear softmax parameters from the paired inputs / outputs.
    reported_probs = reported[PROB_COLS]
    coef_matrix, intercepts, features = fit_exact_webtool_softmax(
        x, reported_probs
    )

    # Predict with the fitted parameters, filling missing inputs with 0.
    # NOTE(review): the predictions are not used or returned here.
    predictions = predict_softmax(
        x, coef_matrix, intercepts, features, fillna=0.0
    )
Neuer Älter