import io
from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


def fit_exact_webtool_softmax(
    x: pd.DataFrame, y: pd.DataFrame
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """Recover the linear part of the web tool's softmax from paired data.

    Uses paired input/output rows (data_2300_skin.csv and
    result_2300_skin.csv) to back out the softmax parameters. The last
    column of ``PROB_COLS`` (the DarktoBlack / Black class) is the baseline
    class, and for every other class ``k`` an ordinary least-squares fit is
    run on::

        log(P_k / P_baseline) = b_k + X @ beta_k

    Args:
        x: Feature table; every column is used as a predictor.
        y: Probability table containing the ``PROB_COLS`` columns and sharing
            its index labels with ``x``.

    Returns:
        A tuple ``(coef_matrix, intercepts, features)``:
        ``coef_matrix`` has shape ``[n_features, n_classes]``, ``intercepts``
        has shape ``[n_classes]`` and ``features`` is the list of column
        names of ``x``. The baseline class column/entry is all zeros.

    Raises:
        ValueError: If no row survives the completeness / validity filters.
    """
    features = list(x.columns)

    # Fit on complete rows only so that missing inputs cannot distort the fit.
    x_complete = x[~x.isna().any(axis=1)]
    y_complete = y.loc[x_complete.index]

    # Drop rows with non-finite or non-positive probabilities: the log-ratio
    # below is undefined for them.
    probs = y_complete[PROB_COLS].to_numpy()
    valid = np.isfinite(probs).all(axis=1) & (probs > 0).all(axis=1)
    design = x_complete.loc[valid, features].to_numpy()
    probs = probs[valid]

    if probs.shape[0] == 0:
        raise ValueError(
            "No usable rows left to fit: every sample has missing features "
            "or non-finite / non-positive probabilities."
        )

    # Log-odds of every non-baseline class relative to the last class.
    # The class count is taken from the data rather than hard-coded.
    log_odds = np.log(probs[:, :-1] / probs[:, -1:])

    coefs = []
    intercepts = []
    for k in range(log_odds.shape[1]):
        lr = LinearRegression()
        lr.fit(design, log_odds[:, k])
        coefs.append(lr.coef_)
        intercepts.append(lr.intercept_)

    # The baseline class has all-zero coefficients and a zero intercept.
    coefs.append(np.zeros(len(features)))
    intercepts.append(0.0)

    coef_matrix = np.column_stack(coefs)  # shape: [n_features, n_classes]
    intercepts = np.array(intercepts)  # shape: [n_classes]

    return coef_matrix, intercepts, features


def predict_softmax(
    x: pd.DataFrame,
    coef_matrix: np.ndarray,
    intercepts: np.ndarray,
    features: List[str],
    fillna: float = 0.0,
) -> pd.DataFrame:
    """Predict class probabilities with the recovered softmax parameters.

    Args:
        x: Input table; must contain every column named in ``features``.
        coef_matrix: Coefficients, one column per class
            (``[n_features, n_classes]``).
        intercepts: Per-class intercepts (``[n_classes]``).
        features: Column names, in the order matching the rows of
            ``coef_matrix``.
        fillna: Value substituted for missing feature values before scoring.

    Returns:
        A DataFrame of probabilities with columns ``PROB_COLS`` and the same
        index as ``x``; each row sums to 1.
    """
    arr = x[features].fillna(fillna).to_numpy(dtype=float)
    z = arr @ coef_matrix + intercepts
    # Subtract the row-wise maximum before exponentiating for numerical
    # stability; the softmax is invariant to this shift.
    z = z - z.max(axis=1, keepdims=True)
    p = np.exp(z)
    p /= p.sum(axis=1, keepdims=True)
    return pd.DataFrame(p, columns=PROB_COLS, index=x.index)


def main() -> None:
    # Read the two uploaded files
    data = pd.read_csv("data_2300_skin.csv")
    result = pd.read_csv("result_2300_skin.csv")

    # Curated paper table (the original note mentions saving it; that step
    # is not visible in this part of the file)
    paper = load_paper_table()

    # Preprocess the input loci
    x = preprocess_input(data)

    # Fit the web tool's softmax linear parameters from the uploaded
    # paired input/output data
    coef_matrix, intercepts, features = fit_exact_webtool_softmax(
        x, result[PROB_COLS]
    )

    # Predict
    pred = predict_softmax(
        x, coef_matrix, intercepts, features, fillna=0.0
    )