multinomial_logistic.2.py
· 2.6 KiB · Python
Исходник
import io
from typing import List, Tuple
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
def fit_exact_webtool_softmax(
    x: pd.DataFrame, y: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """
    Recover the linear part of the web tool's softmax from paired data.

    The paired inputs/outputs (data_2300_skin.csv and result_2300_skin.csv in
    the original workflow) are used to back out the parameters of

        log(P_k / P_ref) = b_k + X @ beta_k

    for every non-reference class k. The reference class is the last entry
    of ``PROB_COLS`` (the Black / DarktoBlack class in the original notes);
    its coefficients and intercept are fixed at zero.

    Args:
        x: Feature table. Every column is used as a predictor; rows that
            contain any NA are excluded from the fit.
        y: Probability table indexed like ``x``. It must contain the
            columns listed in ``PROB_COLS``.

    Returns:
        A tuple ``(coef_matrix, intercepts, features)`` where
        ``coef_matrix`` has shape ``[n_features, n_classes]``,
        ``intercepts`` has shape ``[n_classes]`` and ``features`` is the
        list of column names of ``x`` in the order used for the fit.

    Raises:
        ValueError: If no row is left after dropping incomplete inputs and
            invalid probabilities.
    """
    features = list(x.columns)

    # Fit on complete rows only so NA values cannot distort the regression.
    x_complete = x[~x.isna().any(axis=1)]
    y_complete = y.loc[x_complete.index]

    # Log-odds are undefined for non-finite or non-positive probabilities,
    # so such rows are dropped as well.
    probs = y_complete[PROB_COLS].to_numpy()
    valid = np.isfinite(probs).all(axis=1) & (probs > 0).all(axis=1)
    x_complete = x_complete.loc[valid, features]
    probs = probs[valid]

    if len(x_complete) == 0:
        raise ValueError(
            "No usable rows left after removing incomplete inputs and "
            "invalid probabilities; cannot fit the softmax parameters."
        )

    # Log-odds of every class relative to the last (reference) class.
    n_fitted = len(PROB_COLS) - 1
    target = np.log(probs[:, :n_fitted] / probs[:, n_fitted:])

    # The design matrix is identical for every class, so build it once.
    design = x_complete.to_numpy()

    coefs = []
    intercepts = []
    for k in range(n_fitted):
        lr = LinearRegression()
        lr.fit(design, target[:, k])
        coefs.append(lr.coef_)
        intercepts.append(lr.intercept_)

    # The reference class has all-zero coefficients and a zero intercept.
    coefs.append(np.zeros(len(features)))
    intercepts.append(0.0)

    coef_matrix = np.column_stack(coefs)  # shape: [n_features, n_classes]
    return coef_matrix, np.array(intercepts), features
def predict_softmax(
    x: pd.DataFrame,
    coef_matrix: np.ndarray,
    intercepts: np.ndarray,
    features: List[str],
    fillna: float = 0.0,
) -> pd.DataFrame:
    """
    Predict class probabilities from recovered softmax parameters.

    Args:
        x: Input table; only the columns named in ``features`` are used.
        coef_matrix: Coefficients, one column per class.
        intercepts: Per-class intercepts added to the linear scores.
        features: Column names of ``x`` to use, in the order matching the
            rows of ``coef_matrix``.
        fillna: Value substituted for missing feature values.

    Returns:
        A DataFrame of probabilities indexed like ``x``, with one column
        per entry of ``PROB_COLS``; each row sums to 1.
    """
    selected = x[features].copy()
    design = selected.fillna(fillna).to_numpy(dtype=float)

    logits = design @ coef_matrix + intercepts
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # exp() numerically stable.
    shifted = logits - logits.max(axis=1, keepdims=True)
    weights = np.exp(shifted)
    probabilities = weights / weights.sum(axis=1, keepdims=True)

    return pd.DataFrame(probabilities, columns=PROB_COLS, index=x.index)
def main() -> None:
    """
    Run the fitting pipeline on the two uploaded CSV files.

    Reads the paired input and output files, preprocesses the input loci,
    fits the softmax linear parameters from the pairs and predicts class
    probabilities with them.
    """
    # The two uploaded files: raw inputs and the matching results.
    raw_inputs = pd.read_csv("data_2300_skin.csv")
    web_output = pd.read_csv("result_2300_skin.csv")

    # Cleaned-up paper table.
    # NOTE(review): the return value is not used in the visible code.
    paper_table = load_paper_table()

    # Preprocess the input loci.
    design = preprocess_input(raw_inputs)

    # Fit the softmax linear parameters from the paired input/output data.
    # The result is the tuple (coef_matrix, intercepts, features).
    fitted = fit_exact_webtool_softmax(design, web_output[PROB_COLS])

    # Predict with the fitted parameters.
    # NOTE(review): the predictions are computed but not used afterwards.
    predictions = predict_softmax(design, *fitted, fillna=0.0)
| 1 | import io |
| 2 | from typing import List, Tuple |
| 3 | |
| 4 | import numpy as np |
| 5 | import pandas as pd |
| 6 | from sklearn.linear_model import LinearRegression |
| 7 | |
| 8 | |
| 9 | def fit_exact_webtool_softmax( |
| 10 | x: pd.DataFrame, y: pd.DataFrame |
| 11 | ) -> Tuple[np.ndarray, np.ndarray, List[str]]: |
| 12 | """ |
| 13 | Use the paired data_2300_skin.csv / result_2300_skin.csv data to |
| 14 | back out the linear part of the softmax used by the website. |
| 15 | |
| 16 | Approach: |
| 17 | log(P_k / P_black) = b_k + X @ beta_k, k in [VeryPale, Pale, Intermediate, Dark] |
| 18 | The Black class serves as the reference class. |
| 19 | """ |
| 20 | features = list(x.columns) |
| 21 | |
| 22 | # Fit on complete samples only, so NA values do not interfere |
| 23 | x_complete = x[~x.isna().any(axis=1)] |
| 24 | y_complete = y.loc[x_complete.index] |
| 25 | |
| 26 | # Filter out invalid (non-finite or non-positive) probabilities |
| 27 | arr = y_complete[PROB_COLS].to_numpy() |
| 28 | mask = np.isfinite(arr).all(axis=1) & (arr > 0).all(axis=1) |
| 29 | x_complete = x_complete.loc[mask, features] |
| 30 | y_complete = y_complete.loc[mask, PROB_COLS] |
| 31 | |
| 32 | # Build log-odds relative to the last class (DarktoBlack / Black) |
| 33 | target = np.log( |
| 34 | y_complete.iloc[:, :4].to_numpy() |
| 35 | / y_complete.iloc[:, 4].to_numpy()[:, None] |
| 36 | ) |
| 37 | |
| 38 | coefs = [] |
| 39 | intercepts = [] |
| 40 | |
| 41 | for i in range(4): |
| 42 | lr = LinearRegression() |
| 43 | lr.fit(x_complete.to_numpy(), target[:, i]) |
| 44 | coefs.append(lr.coef_) |
| 45 | intercepts.append(lr.intercept_) |
| 46 | |
| 47 | # Reference-class coefficients are all 0 |
| 48 | coefs.append(np.zeros(len(features))) |
| 49 | intercepts.append(0.0) |
| 50 | |
| 51 | coef_matrix = np.column_stack(coefs) # shape: [n_features, 5] |
| 52 | intercepts = np.array(intercepts) # shape: [5] |
| 53 | |
| 54 | return coef_matrix, intercepts, features |
| 55 | |
| 56 | |
| 57 | def predict_softmax( |
| 58 | x: pd.DataFrame, |
| 59 | coef_matrix: np.ndarray, |
| 60 | intercepts: np.ndarray, |
| 61 | features: List[str], |
| 62 | fillna: float = 0.0, |
| 63 | ) -> pd.DataFrame: |
| 64 | """ |
| 65 | Predict the 5 class probabilities using the recovered softmax parameters. |
| 66 | """ |
| 67 | arr = x[features].copy().fillna(fillna).to_numpy(dtype=float) |
| 68 | z = arr @ coef_matrix + intercepts |
| 69 | z = z - z.max(axis=1, keepdims=True) # numerical stability |
| 70 | p = np.exp(z) |
| 71 | p /= p.sum(axis=1, keepdims=True) |
| 72 | return pd.DataFrame(p, columns=PROB_COLS, index=x.index) |
| 73 | |
| 74 | |
| 75 | |
| 76 | def main() -> None: |
| 77 | # Read the two uploaded files |
| 78 | data = pd.read_csv("data_2300_skin.csv") |
| 79 | result = pd.read_csv("result_2300_skin.csv") |
| 80 | |
| 81 | # Save the cleaned-up paper table |
| 82 | paper = load_paper_table() |
| 83 | |
| 84 | # Preprocess the input loci |
| 85 | x = preprocess_input(data) |
| 86 | |
| 87 | # Fit the website's softmax linear parameters from the uploaded paired input/output data |
| 88 | coef_matrix, intercepts, features = fit_exact_webtool_softmax( |
| 89 | x, result[PROB_COLS] |
| 90 | ) |
| 91 | |
| 92 | # Predict |
| 93 | pred = predict_softmax( |
| 94 | x, coef_matrix, intercepts, features, fillna=0.0 |
| 95 | ) |
| 96 |