提取细菌基因组ORF思路:
1.通过FNA文件得到细菌基因组序列
2.分正负链和三个相位共6种情况统计ORF
3.写入文件
转载请保留出处!
贴上Python代码(版本:3.6)
1 # -*- coding: utf-8 -*- 2 """ 3 Created on Thu Dec 14 13:19:00 2017 4 5 @author: zxzhu 6 """ 7 8 import re 9 def N2M(sequence): #正负链转换10 hash = { 'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C','N':'N'}11 sequence = ''.join([hash[i] for i in sequence]) 12 return sequence[::-1]13 14 def translate(seq): #将序列转换为起始,终止,其他密码子15 pa1 = re.compile(r'TAA|TAG|TGA')16 after_trans = ''17 for i in range(0,len(seq),3):18 if seq[i:i+3]=='ATG':19 after_trans+='I'20 elif pa1.match(seq[i:i+3]):21 after_trans+='T'22 else:23 after_trans+='O'24 return after_trans25 26 def get_orf(seq,length=90):27 pa2 = re.compile(r'I[IO]+?T') #匹配模式:起始1非终止1~N终止128 trans_seq = translate(seq)29 m = pa2.finditer(trans_seq) #所有匹配结果的迭代30 index = []31 orf = []32 for i in m:33 index.append(i.span()) #序列起始,终止位置34 for i in index:35 orf_start = i[0]*336 orf_end = i[1]*337 #print(orf_start,orf_end)38 if orf_end - orf_start >= length: #不小于90bp39 orf.append(seq[orf_start:orf_end])40 return orf41 42 def Seq2AA(sequence,hash): #翻译为AA序列43 AA='' 44 for i in range(0, len(sequence) - 3, 3):45 AA += hash[sequence[i:i + 3]]46 return AA47 48 def main(fna,length=90):49 fn = open(fna)50 pa = re.compile(r'\s+')51 hash_seq = {} # CDS hash,CDS2sequence52 result1 = open('orf_seq.txt','w')53 result2 = open('orf_AA.txt','w')54 start = [0,1,2] #相位55 strain = '+-' #正负链56 hash_AA = {} # AA hash,sequence2AA57 with open('AA.txt', 'r') as f: #AA.txt 为密码子表58 for line in f:59 line = line.strip()60 if line:61 line = pa.split(line)62 hash_AA[line[0]] = line[1] #AA hash63 64 for line in fn: #获取序列65 line = line.strip()66 if line.startswith('>'):67 A = pa.split(line)[0].replace('>', '')68 hash_seq[A] = ''69 else:70 hash_seq[A] += line71 72 for key in hash_seq.keys(): #分+-链,3个相位统计ORF73 seq = hash_seq[key]74 for r in strain:75 if r == '-':76 seq = N2M(seq)77 for s in start:78 seq = seq[s:]79 #trans_seq = translate(seq)80 orf = get_orf(seq)81 for i in orf:82 if 'N' not in i: #去除N83 AA =Seq2AA(i,hash_AA) 84 result1.write('>'+key+'\t'+r+'\t'+str(s)+'\n'+i+'\n')85 result2.write('>'+key+'\t'+r+'\t'+str(s)+'\n'+AA+'\n')86 fn.close()87 result1.close()88 result2.close()89 90 91 fna = 'GCA_000160075.2_ASM16007v2_genomic.fna'92 main(fna)
NCBI可以找ORF,很方便。码一下: