#!/usr/bin/env python

import sys, warnings
from os import path
import xml.etree.ElementTree as ET
from Bio import Entrez
from Bio import SeqIO

# Contact address that Bio.Entrez passes along with its NCBI E-utilities
# requests (used by get_tax_id below).
Entrez.email= 'bryn.dentinger@gmail.com'

def parse_label(xml_file):
    """Return the species name found in a JGI project XML file.

    Only the first ``<file>`` element in document order is consulted; the
    first two whitespace-separated words of its ``label`` attribute are
    joined with a single space.

    Returns an empty string when the document has no ``<file>`` element.
    Raises ``KeyError`` if that first element has no ``label`` attribute.
    """
    first_file = next(ET.parse(xml_file).iter(tag='file'), None)
    if first_file is None:
        return ''
    words = first_file.attrib['label'].split()
    return ' '.join(words[:2])
    
def get_tax_id(species):
    """Look up the NCBI taxonomy ID for a species name via Entrez esearch.

    The name is stripped of surrounding whitespace and its internal spaces
    are replaced by ``+`` before being used as the search term.

    Returns the first entry of the result's ``IdList``. If the lookup fails
    for any reason (network error, unparsable reply, no hit) a warning is
    issued and an empty string is returned instead.
    """
    # Strip *before* substituting: the other way round, leading/trailing
    # blanks have already become '+' and strip() can no longer remove them.
    # NOTE(review): Bio.Entrez appears to URL-encode parameters itself, so the
    # manual '+' substitution may be unnecessary -- confirm before removing.
    species = species.strip().replace(" ", "+")
    try:
        search = Entrez.esearch(term=species, db="taxonomy", retmode="xml")
        try:
            record = Entrez.read(search)
        finally:
            # Always release the HTTP handle, even if parsing fails.
            search.close()
        tax_id = record['IdList'][0]
    except Exception:
        # Best-effort lookup: report and fall back to an empty ID, but let
        # KeyboardInterrupt / SystemExit propagate (a bare except would not).
        warnings.warn("No taxid for %s" % species)
        tax_id = ''
    return tax_id
    
def change_names(seq_records, tax_id):
    """Lazily relabel sequence records with a kraken taxid prefix.

    For each record taken from ``seq_records`` the description is blanked
    and the id is rewritten to ``kraken:taxid|<tax_id>|<original id>``.
    Records are modified in place and yielded one at a time, so nothing is
    touched until the generator is consumed.
    """
    for record in seq_records:
        record.description = ''
        header_fields = ('kraken:taxid', tax_id, record.id)
        record.id = '|'.join(header_fields)
        yield record
        
# Script body.
# usage: <script> <fasta_file> [genus [species]]
# Writes <fasta_file stem>_out<ext> with every header rewritten to
# "kraken:taxid|<taxid>|<original id>".
if len(sys.argv) < 2:
    # Without this guard a missing argument surfaces as a bare IndexError.
    sys.exit("usage: %s <fasta_file> [genus [species]]" % sys.argv[0])

infile = sys.argv[1]
# At most two words (arguments 2 and 3) form the species name.
species = ' '.join(sys.argv[2:4])
# Drop a " sp." placeholder so only the genus is searched.
species = species.replace(' sp.', '')
# NOTE(review): get_tax_id returns '' (after a warning) when the lookup
# fails; the headers written below then read "kraken:taxid||<id>".
tax_id = get_tax_id(species)

# Output goes next to the input: "reads.fa" -> "reads_out.fa".
filename, file_extension = path.splitext(infile)
outfile = ''.join([filename, '_out', file_extension])
try:
    sequence_parser = SeqIO.parse(infile, "fasta")
    # change_names is a generator, so records are streamed to the output
    # file one at a time.
    SeqIO.write(change_names(sequence_parser, tax_id), outfile, "fasta")
except IOError:
    warnings.warn("%s: no such file" % infile)
