shuaizhou revised this gist 3 days ago. Go to revision
1 file changed, 111 insertions
multinomial_logistic.1.py(file created)
| @@ -0,0 +1,111 @@ | |||
| 1 | + | import numpy as np | |
| 2 | + | import pandas as pd | |
| 3 | + | ||
def build_design_matrix(x: pd.DataFrame) -> np.ndarray:
    """Prepend an all-ones intercept column to the feature table.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; every column is cast to float.

    Returns
    -------
    np.ndarray of shape (n_samples, n_features + 1)
        Design matrix whose first column is the constant 1.0.
    """
    features = x.to_numpy(dtype=float)
    n_rows, n_cols = features.shape
    design = np.empty((n_rows, n_cols + 1), dtype=float)
    design[:, 0] = 1.0
    design[:, 1:] = features
    return design
| 11 | + | ||
| 12 | + | ||
def fit_multinomial_from_probabilities(
    x: pd.DataFrame,
    prob: pd.DataFrame,
    reference_index: int = 4,
):
    """Fit a multinomial logistic model from known class probabilities.

    For each non-reference class k, solves the ordinary least-squares
    problem on the log-odds target:

        log(p_k / p_ref) = alpha_k + x^T beta_k

    Parameters
    ----------
    x : pd.DataFrame
        Feature table (one row per sample).
    prob : pd.DataFrame
        Class-probability table; every column is treated as one class.
    reference_index : int
        Positional index of the reference class within ``prob``'s columns.
        Its coefficient column is fixed at zero.

    Returns
    -------
    coef_mat : np.ndarray of shape (n_features + 1, K)
        Row 0 is the intercept; the reference class column is all zeros.
    feature_names : list[str]
        Column names of ``x`` in the order used for fitting.
    mask : pd.Series[bool]
        Which input rows were used (complete features, finite and
        strictly positive probabilities).
    """
    feature_names = list(x.columns)
    # FIX: the original read an undefined module-level global PROB_COLS,
    # which raised NameError.  The class columns are taken from `prob`
    # itself instead, which also generalizes to any number of classes.
    prob_cols = list(prob.columns)

    # Keep only rows with complete features and finite, strictly positive
    # probabilities -- log of zero/negative probabilities is undefined.
    mask_x = ~x.isna().any(axis=1)
    p_vals = prob[prob_cols]
    mask_p = np.isfinite(p_vals).all(axis=1) & (p_vals > 0).all(axis=1)
    mask = mask_x & mask_p

    x_fit = x.loc[mask, feature_names]
    p_fit = prob.loc[mask, prob_cols]

    # Design matrix with a leading intercept column: n x (p + 1).
    X = np.hstack([
        np.ones((len(x_fit), 1), dtype=float),
        x_fit.to_numpy(dtype=float),
    ])
    P = p_fit.to_numpy(dtype=float)  # n x K

    d = X.shape[1]
    K = P.shape[1]

    # Reference class column is left at zero by construction.
    coef_mat = np.zeros((d, K), dtype=float)
    p_ref = P[:, [reference_index]]

    # Fit each non-reference class independently on its log-ratio target.
    for k in range(K):
        if k == reference_index:
            continue
        target = np.log(P[:, k:k + 1] / p_ref).ravel()  # shape (n,)
        theta, _residuals, _rank, _sv = np.linalg.lstsq(X, target, rcond=None)
        coef_mat[:, k] = theta

    return coef_mat, feature_names, mask
| 58 | + | ||
| 59 | + | ||
def predict_multinomial(
    x: pd.DataFrame,
    coef_mat: np.ndarray,
    feature_names,
    class_names=None,
):
    """Predict softmax class probabilities from fitted coefficients.

    Parameters
    ----------
    x : pd.DataFrame
        Feature table; missing values are imputed with 0 so prediction
        never fails on NaN.
    coef_mat : np.ndarray of shape (n_features + 1, K)
        Coefficient matrix; row 0 is the intercept.
    feature_names : sequence of str
        Columns of ``x`` to use, in fitting order.
    class_names : sequence or None
        Column labels for the returned probability frame; defaults to
        0..K-1.  (FIX: the original read an undefined module-level
        global PROB_COLS, which raised NameError.)

    Returns
    -------
    pd.DataFrame of shape (n, K)
        Row-stochastic probabilities, indexed like ``x``.
    """
    # Design matrix with a leading intercept column.
    X = np.hstack([
        np.ones((len(x), 1), dtype=float),
        x[feature_names].fillna(0.0).to_numpy(dtype=float),
    ])
    logits = X @ coef_mat  # n x K

    # Subtract the row-wise max before exponentiating for numerical
    # stability; the shift cancels in the softmax ratio.
    logits = logits - logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(logits)
    probs = exp_logits / exp_logits.sum(axis=1, keepdims=True)

    if class_names is None:
        class_names = list(range(coef_mat.shape[1]))
    return pd.DataFrame(probs, columns=class_names, index=x.index)
| 77 | + | ||
| 78 | + | ||
def export_coefficients(coef_mat: np.ndarray, feature_names, class_names=None):
    """Return the coefficient matrix as a human-readable labelled table.

    Parameters
    ----------
    coef_mat : np.ndarray of shape (n_features + 1, K)
        Coefficient matrix; row 0 is the intercept.
    feature_names : sequence of str
        Names for the feature rows (row 0 is labelled "Intercept").
    class_names : sequence or None
        Column labels, one per class; defaults to 0..K-1.
        (FIX: the original read an undefined module-level global
        PROB_COLS, which raised NameError.)

    Returns
    -------
    pd.DataFrame
        Coefficients indexed by ["Intercept", *feature_names].
    """
    rows = ["Intercept"] + list(feature_names)
    if class_names is None:
        class_names = list(range(coef_mat.shape[1]))
    return pd.DataFrame(coef_mat, index=rows, columns=class_names)
| 90 | + | ||
| 91 | + | ||
def main():
    """Load genotype data and class probabilities, fit the multinomial
    model, and return the fitted artefacts.

    NOTE(review): `preprocess_genotype` and the global `PROB_COLS` used
    by the helpers are not defined in this file -- presumably provided
    elsewhere; confirm before running.
    """
    genotype_raw = pd.read_csv("data_2300_skin.csv")
    probabilities = pd.read_csv("result_2300_skin.csv")

    # Encode genotypes as allele counts (0 / 1 / 2).
    features = preprocess_genotype(genotype_raw)

    # Class index 4 (PDarktoBlackSkin) serves as the reference class.
    coef_mat, feature_names, _fit_mask = fit_multinomial_from_probabilities(
        x=features,
        prob=probabilities,
        reference_index=4,
    )

    # Computed for inspection only; the original flow discards both.
    _pred = predict_multinomial(features, coef_mat, feature_names)
    _coef_df = export_coefficients(coef_mat, feature_names)

    return features, coef_mat, feature_names
| 111 | + | ||
Newer
Older