python – How should I format this regex to find this desired amino acid sequence?

input FGFR_fusion.fasta:

>fibroblast growth factor receptor 1 isoform
AAAAAAAAAE

With this code (see "Regex to match character in specific position and specific length"):

import re

# Header text that identifies the record of interest in the FASTA file.
TARGET_HEADER = 'fibroblast growth factor receptor 1 isoform'

# Anchored at the start of the string: nine arbitrary characters followed by
# I, V or E, i.e. the residue at position 10 must be one of those three.
AA10 = re.compile(r'^.{9}[IVE]')


def record_sequence(record):
    """Return the residues of one FASTA record as a single string.

    `record` is the text found between two ">" markers: a header line
    followed by the sequence lines. The header line is dropped and the
    remaining lines are joined, so a sequence wrapped over several lines is
    kept whole instead of being cut after its first line.
    """
    return ''.join(line.strip() for line in record.splitlines()[1:])


# --- Step 1: copy the sequence of the target record to "PSB HW_6.txt" -------
with open("FGFR_fusion.fasta", 'r') as fasta:
    test = fasta.read()

print('test : ', test)

# records[0] is whatever precedes the first ">" (normally an empty string).
records = test.split(">")

# Debug preview of the first record's sequence; empty when the file holds no
# record at all (the original indexing raised IndexError in that case).
test2 = record_sequence(records[1]) if len(records) > 1 else ''

# Printed twice so the trace matches the sample output listed after the script.
print('test2 : ', test2)
print('test2 : ', test2)

# The `with` block closes the file whether or not a record matches.
with open("PSB HW_6.txt", 'w') as out:
    for x in records:

        print('x : ', x)

        header = x.partition('\n')[0]
        if TARGET_HEADER in header:
            # Use the sequence of the record that actually matched, not
            # unconditionally the first record of the file.
            test2 = record_sequence(x)

            print('test2 : ', test2)
            out.write(test2)
            # Only the first matching record is kept.
            break

# --- Step 2: report the residue at position 10 if it is I, V or E -----------
with open("PSB HW_6.txt", 'r') as filtered:
    test2 = filtered.read()

print('test2 : ', test2)

matches = re.finditer(AA10, test2)

print('matches : ', matches)

found = []
# The output file is closed once, after the loop, instead of inside it.
with open("10ID.txt", 'w') as out:
    for match in matches:
        print('match.group() : ', match.group())
        found.append(match.group())
        print(f"Found {match.group(0)} at position {match.start()}\n")
        # The match is exactly ten characters long; index 9 is residue 10.
        out.write(match.group()[9])

if not found:
    print("No match found.")

output :

test :  >fibroblast growth factor receptor 1 isoform
AAAAAAAAAE

    test2 :  AAAAAAAAAE
    test2 :  AAAAAAAAAE
    x :  
    x :  fibroblast growth factor receptor 1 isoform
    AAAAAAAAAE
    
    test2 :  AAAAAAAAAE
    test2 :  AAAAAAAAAE
    matches :  <callable_iterator object at 0x...>
    match.group() :  AAAAAAAAAE
    Found AAAAAAAAAE at position 0

and files :

PSB HW_6.txt : AAAAAAAAAE ;

10ID.txt : E

With Biopython (same input and same outputs), the code is:

 import Bio

print('\n-------------------------------')

print('\n Biopython Version : ', Bio.__version__)

print('\n-------------------------------')

from Bio import SeqIO

# Text that must appear in the FASTA description of the record of interest.
TARGET_DESCRIPTION = 'fibroblast growth factor receptor 1 isoform'

# Residues accepted at position 10 of the selected sequence.
ACCEPTED_RESIDUES = ('I', 'V', 'E')

# Initialised before the loop so the final check also works when the FASTA
# file contains no record at all.
found = []

# Both output files are opened once and closed by the `with` statement, so
# they are not truncated again for every record and no handle is left open.
with open("FGFR_fusion.fasta", 'r') as fasta, \
        open("PSB HW_6.txt", 'w') as filtered_out, \
        open("10ID.txt", 'w') as residue_out:
    test = SeqIO.parse(fasta, 'fasta')

    print('test : ', test)

    for record in test:
        print('record : ', record)
        print('record.id : ', record.id , 'record.seq : ', record.seq)

        # Skip every record except the one selected by its description.
        if TARGET_DESCRIPTION not in record.description:
            continue

        print("record.description : ", record.description)
        print('OK')

        sequence = str(record.seq)
        filtered_out.write(sequence)

        # Index 9 is residue 10; the length guard avoids an IndexError on
        # sequences shorter than ten residues.
        if len(sequence) >= 10 and sequence[9] in ACCEPTED_RESIDUES:
            found.append(sequence[9])
            print(f"Found {sequence[9]} at position 10\n")
            residue_out.write(sequence[9])

        # Only the first matching record is processed.
        break

if not found:
    print("No match found.")

Read more here: Source link