Last active 1 month ago

shuaizhou revised this gist 1 month ago. Go to revision

1 file changed, 1 insertion, 13 deletions

parser_vcf.py

@@ -9,21 +9,9 @@ class QC:
9 9
10 10 def __init__(
11 11 self,
12 - uid: str,
13 - library_num: int,
14 - output_qc: str,
15 - input_vcf: str,
16 - basic_reads: str,
17 - bamstats: str,
18 - depth: int
12 + input_vcf: str
19 13 ):
20 - self.uid = uid
21 - self.library_num = library_num
22 - self.output_qc = output_qc
23 14 self.input_vcf = input_vcf
24 - self.basic_reads = basic_reads
25 - self.bamstats = bamstats
26 - self.depth = depth
27 15
28 16 def _gt_to_bases(self, gt: str, ref: str, alt: str) -> str | None:
29 17 if gt is None:

shuaizhou revised this gist 1 month ago. Go to revision

1 file changed, 214 insertions

parser_vcf.py(file created)

@@ -0,0 +1,214 @@
1 + import os
2 + import re
3 + import gzip
4 + from typing import Tuple, List
5 + import pandas as pd
6 +
7 + class QC:
8 + _DP_RE = re.compile(r"(?:^|;)\s*DP=(\d+(?:\.\d+)?)")
9 +
10 + def __init__(
11 + self,
12 + uid: str,
13 + library_num: int,
14 + output_qc: str,
15 + input_vcf: str,
16 + basic_reads: str,
17 + bamstats: str,
18 + depth: int
19 + ):
20 + self.uid = uid
21 + self.library_num = library_num
22 + self.output_qc = output_qc
23 + self.input_vcf = input_vcf
24 + self.basic_reads = basic_reads
25 + self.bamstats = bamstats
26 + self.depth = depth
27 +
28 + def _gt_to_bases(self, gt: str, ref: str, alt: str) -> str | None:
29 + if gt is None:
30 + return None
31 + gt = str(gt)
32 + if gt in {".", "./.", ".|."}:
33 + return None
34 +
35 + alts = [] if alt in {None, ".", ""} else str(alt).split(",")
36 + alleles = [ref] + alts # 0->REF, 1..->ALT
37 +
38 + if "/" in gt:
39 + parts = gt.split("/")
40 + elif "|" in gt:
41 + parts = gt.split("|")
42 + else:
43 + parts = [gt] # haploid like "1"
44 +
45 + bases = []
46 + for p in parts:
47 + if p in {".", ""}:
48 + bases.append(".")
49 + continue
50 + try:
51 + idx = int(p)
52 + except ValueError:
53 + return None
54 + if 0 <= idx < len(alleles):
55 + bases.append(alleles[idx])
56 + else:
57 + return None
58 +
59 + return "/".join(bases)
60 +
61 + @classmethod
62 + def dp_difference(cls, info: str, dp: int) -> float:
63 + """
64 + INFO列里的DP 与 sample列里的DP 差值(INFO - sample)
65 + """
66 + if info is None:
67 + return float(0 - dp)
68 + m = cls._DP_RE.search(str(info))
69 + if m:
70 + return float(m.group(1)) - dp
71 + return float(0 - dp)
72 +
73 + def parse_vcf_row(self, row) -> pd.Series:
74 + """
75 + row: tuple/list-like, 对应VCF的 0..9 列:
76 + 0 CHROM,1 POS,2 ID,3 REF,4 ALT,5 QUAL,6 FILTER,7 INFO,8 FORMAT,9 SAMPLE
77 + """
78 + chr_id = row[0]
79 + position = row[1]
80 + ref_base = row[3]
81 + alt_base = row[4]
82 +
83 + fmt_keys = str(row[8]).split(":")
84 + sample_vals = str(row[9]).split(":")
85 + fmt = dict(zip(fmt_keys, sample_vals))
86 +
87 + data = {"A": 0, "T": 0, "G": 0, "C": 0}
88 +
89 + gt = fmt.get("GT", None)
90 + gt_base = self._gt_to_bases(gt, ref_base, alt_base)
91 +
92 + # AD
93 + ad = fmt.get("AD", None)
94 + ad_sum = 0
95 + if ad not in {None, ".", ""}:
96 + try:
97 + ad_int = [int(i) for i in str(ad).split(",")]
98 + ad_sum = sum(ad_int)
99 +
100 + bases = [ref_base] if alt_base in {None, ".", ""} else [ref_base] + str(alt_base).split(",")
101 + for b, n in zip(bases, ad_int):
102 + if b in data: # only A/T/G/C
103 + data[b] = n
104 + except ValueError:
105 + ad = None
106 + ad_sum = 0
107 + else:
108 + ad = None
109 +
110 + # DP
111 + dp = 0
112 + v = fmt.get("DP", None)
113 + if v not in {None, ".", ""}:
114 + try:
115 + dp = int(v)
116 + except ValueError:
117 + dp = 0
118 +
119 + dp_diff = self.dp_difference(row[7], dp)
120 +
121 + # GQ / RGQ
122 + gq = None
123 + for k in ("GQ", "RGQ"):
124 + v = fmt.get(k, None)
125 + if v not in {None, ".", ""}:
126 + try:
127 + gq = float(v)
128 + except ValueError:
129 + gq = None
130 + break
131 +
132 + # PL
133 + pls = fmt.get("PL", None)
134 + if pls in {".", ""}:
135 + pls = None
136 +
137 + if ad_sum > 0:
138 + afA = data["A"] / ad_sum
139 + afT = data["T"] / ad_sum
140 + afG = data["G"] / ad_sum
141 + afC = data["C"] / ad_sum
142 + else:
143 + afA = afT = afG = afC = 0.0
144 +
145 + return pd.Series(
146 + [
147 + chr_id, position,
148 + data["A"], data["T"], data["G"], data["C"],
149 + afA, afT, afG, afC,
150 + gt, gt_base,
151 + ad, ad_sum,
152 + dp, dp_diff,
153 + gq, pls
154 + ],
155 + index=[
156 + "chr", "pos",
157 + "A", "T", "G", "C",
158 + "A_frac", "T_frac", "G_frac", "C_frac",
159 + "GT", "GT_base",
160 + "AD", "AD_sum",
161 + "DP", "DP_diff",
162 + "GQ", "PL"
163 + ]
164 + )
165 +
166 + def read_vcf(self, input_vcf: str) -> Tuple[pd.DataFrame, List[str], List[str]]:
167 + """
168 + 返回: (vcf_df, vcf_header(##), header_line(#CHROM...))
169 + """
170 + is_gzipped = input_vcf.endswith(".gz")
171 + open_func = gzip.open if is_gzipped else open
172 + mode = "rt" if is_gzipped else "r"
173 +
174 + vcf_header: List[str] = []
175 + header_line: List[str] = []
176 +
177 + # 逐行读header,避免 readlines() 占内存
178 + with open_func(input_vcf, mode) as f:
179 + for line in f:
180 + if line.startswith("##"):
181 + vcf_header.append(line.rstrip("\n"))
182 + continue
183 + if line.startswith("#CHROM"):
184 + header_line = line.strip().split()
185 + break
186 +
187 + # 主体用 pandas 读(comment='#' 会跳过所有header行)
188 + vcf_df = pd.read_csv(
189 + input_vcf,
190 + comment="#",
191 + sep="\t",
192 + header=None,
193 + compression="infer",
194 + dtype={0: str, 1: int, 3: str, 4: str, 7: str, 8: str, 9: str},
195 + low_memory=False,
196 + )
197 + return vcf_df, vcf_header, header_line
198 +
199 + def parse_vcf_df(self, vcf_df: pd.DataFrame) -> pd.DataFrame:
200 + """
201 + 只解析,不过滤。比 iterrows() 更快。
202 + """
203 + out = []
204 + # itertuples 更快;name=None 返回普通tuple
205 + for row in vcf_df.itertuples(index=False, name=None):
206 + out.append(self.parse_vcf_row(row))
207 + return pd.DataFrame(out)
208 +
209 + def main(self) -> pd.DataFrame:
210 + # logger.info(f"starting: {self.input_vcf}")
211 + vcf_df, vcf_header, header_line = self.read_vcf(self.input_vcf)
212 + parsed = self.parse_vcf_df(vcf_df)
213 + # logger.info(f"done: {self.input_vcf}")
214 + return parsed
Newer Older