python – How should I format this regex to find this desired amino acid sequence?
Input file FGFR_fusion.fasta:
>fibroblast growth factor receptor 1 isoform
AAAAAAAAAE
With this code (see "Regex to match character in specific position and specific length"):
import re

# Header text identifying the FASTA record of interest.
TARGET_HEADER = 'fibroblast growth factor receptor 1 isoform'

# Read the whole FASTA file; every record starts with a '>' header line.
with open("FGFR_fusion.fasta", 'r') as fasta:
    test = fasta.read()
print('test : ', test)

# Debug preview of the first record's first sequence line.
# NOTE: this assumes the file holds at least one '>' record that has a
# sequence line; an empty or header-only file raises IndexError here.
test2 = test.split(">")[1].split('\n')[1]
print('test2 : ', test2)
print('test2 : ', test2)  # printed twice, as in the original debug trace

# Step 1: copy the sequence of every record whose header matches into the
# intermediate file. The context manager closes the file even on error.
with open("PSB HW_6.txt", 'w') as out:
    for x in test.split(">"):
        print('x : ', x)
        header, _, body = x.partition('\n')
        if TARGET_HEADER in header:
            # Use this record's own sequence (not the first record's), and
            # join wrapped lines so multi-line sequences stay intact.
            test2 = ''.join(body.split())
            print('test2 : ', test2)
            out.write(test2)

# Step 2: read the filtered sequence back and test its 10th residue.
with open("PSB HW_6.txt", 'r') as filtered:
    test2 = filtered.read()
print('test2 : ', test2)

# Any nine characters from the start of the text, then I, V or E at index 9.
AA10 = re.compile(r'^.{9}[IVE]')
matches = AA10.finditer(test2)
print('matches : ', matches)

found = []
with open("10ID.txt", 'w') as out:
    for match in matches:
        print('match.group() : ', match.group())
        found.append(match.group())
        print(f"Found {match.group(0)} at position {match.start()}\n")
        # Only the 10th residue itself is written to the result file.
        out.write(match.group()[9])

if not found:
    print("No match found.")
Output:
test : >fibroblast growth factor receptor 1 isoform
AAAAAAAAAE
test2 : AAAAAAAAAE
test2 : AAAAAAAAAE
x :
x : fibroblast growth factor receptor 1 isoform
AAAAAAAAAE
test2 : AAAAAAAAAE
test2 : AAAAAAAAAE
matches :  <callable_iterator object at 0x...>
match.group() : AAAAAAAAAE
Found AAAAAAAAAE at position 0
and the resulting files:
PSB HW_6.txt : AAAAAAAAAE ;
10ID.txt : E
With Biopython (same input and same outputs), the code is:
import Bio
from Bio import SeqIO

print('\n-------------------------------')
print('\n Biopython Version : ', Bio.__version__)
print('\n-------------------------------')

# Description text identifying the FASTA record of interest.
TARGET_DESCRIPTION = 'fibroblast growth factor receptor 1 isoform'
# Residues accepted at position 10 (index 9) of the sequence.
TENTH_RESIDUES = ('I', 'V', 'E')

found = []
# Open both output files once, outside the record loop: reopening them with
# mode 'w' for every record would truncate what earlier records wrote.
with open("FGFR_fusion.fasta", 'r') as fasta, \
        open("PSB HW_6.txt", 'w') as filtered, \
        open("10ID.txt", 'w') as out:
    test = SeqIO.parse(fasta, 'fasta')
    print('test : ', test)
    for record in test:
        print('record : ', record)
        print('record.id : ', record.id, 'record.seq : ', record.seq)
        # Only records whose description matches are filtered and tested.
        if TARGET_DESCRIPTION not in record.description:
            continue
        print("record.description : ", record.description)
        print('OK')
        sequence = str(record.seq)
        filtered.write(sequence)
        # A sequence shorter than 10 residues has no position 10 to test;
        # checking the length first avoids an IndexError.
        if len(sequence) >= 10 and sequence[9] in TENTH_RESIDUES:
            found.append(sequence[9])
            print(f"Found {sequence[9]} at position 10\n")
            out.write(sequence[9])

if not found:
    print("No match found.")
Read more here: Source link
