import os
import pdb
import time
from collections import defaultdict

import requests
#from Bio import Entrez

#pdb.set_trace()
def count_tax_pub(taxon, timeout=30):
    """Return the PMC link ids that NCBI ELink reports for a taxonomy id.

    Despite the name, the result is the list of ids, not a count; callers
    take ``len()`` of it.

    Args:
        taxon: Taxonomy id (anything ``str()`` turns into the id text).
        timeout: Seconds to wait for the HTTP request before
            ``requests`` raises a timeout error.

    Returns:
        A list of ints, empty when the response has no ``linksetdbs`` entry.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-2xx HTTP status.
        KeyError: if the JSON payload has no ``linksets`` key.
    """
    base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=taxonomy&db=pmc&retmode=json&id="
    response = requests.get(base + str(taxon), timeout=timeout)
    # Fail loudly on HTTP errors instead of tripping over an unexpected body.
    response.raise_for_status()
    # Parse the body once and reuse the result.
    linksets = response.json()['linksets']
    # check for empty results, see taxon 255787 for structure
    if not linksets or 'linksetdbs' not in linksets[0]:
        return []
    # Only the first linksetdb is read; its 'links' are converted to ints.
    return [int(link) for link in linksets[0]['linksetdbs'][0]['links']]

# Read the duplicate-genus table and fetch the link ids for every genus.
#   dupeg : genus taxonomy ids (ints), in file order
#   refs  : taxonomy id -> list of ids returned by count_tax_pub()
#           (an empty list when the genus has no links)
dupeg = []
refs = defaultdict(list)
duplicates = "duplicate_genus_names.txt"
#duplicates = "minidupes"
# The context manager closes the file even if a fetch raises part-way through.
with open(duplicates, "r") as fd:
    for i, line in enumerate(fd):
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at EOF
        fields = line.split("\t")
        genus = fields[1]  # second tab-separated column holds the taxonomy id
        genus_id = int(genus)
        dupeg.append(genus_id)
        print("fetching ",genus)
        # Pause one second before every third request -- presumably to stay
        # within NCBI's request-rate limit (TODO confirm).
        if i % 3 == 0:
            time.sleep(1)
        refs[genus_id] = count_tax_pub(genus_id)
 
# For each duplicate genus, walk its lineage upwards until a taxon has more
# than `threshold` link ids, then write the ids linked to that taxon but not
# to the genus itself to abs/<genus>/refs<taxon>, one id per line.
# Every lineage read (duplicate genus or not) is recorded in lineage_lists.
lineage_lists = defaultdict(list)
#lineage = "taxftp/taxidlineage.dmp"
lineage = "taxftp/taxidlineage.dmp.duplicate_genera"
threshold = 1000
# Set copy of dupeg so the per-line membership test is O(1).
dupe_set = set(dupeg)
with open(lineage, "r") as fd:
    for line in fd:
        if not line.strip():
            continue  # tolerate blank lines
        fields = line.split("|")
        genus = int(fields[0])
        # Second '|'-separated column: whitespace-separated taxonomy ids.
        # Assumes they are listed root-first, so reversing walks upwards from
        # the nearest ancestor -- TODO confirm against the dump format.
        f2i = [int(tok) for tok in fields[1].split()]
        if genus in dupe_set:
            #print("genus found ",genus)
            fetched = 0
            for x in reversed(f2i):
                # Lineages share ancestors; reuse ids fetched earlier rather
                # than hitting the network again for the same taxon.
                if x not in refs:
                    # Pause one second before every third real request.
                    if fetched % 3 == 0:
                        time.sleep(1)
                    refs[x] = count_tax_pub(x)
                    fetched += 1
                if len(refs[x]) > threshold:
                    print("genus ",genus, len(refs[genus]), "done at taxon ",x,len(refs[x]) )
                    # Sorted so the output file is reproducible between runs.
                    difflist = sorted(set(refs[x]) - set(refs[genus]))
                    fname = "abs/"+str(genus)+"/refs"+str(x)
                    # Create abs/<genus>/ on demand instead of failing with
                    # FileNotFoundError after the fetches have been done.
                    os.makedirs(os.path.dirname(fname), exist_ok=True)
                    with open(fname, "w") as fout:
                        fout.writelines("{}\n".format(pmcid) for pmcid in difflist)
                    break

        lineage_lists[genus].append(f2i)
 
#grep ^691691 
#elink -name taxonomy_pmc -id 131567 -target pmc -db taxonomy | efetch
#cat abs/1036680/refs147550 | tr '\n' ',' | xargs -I {}  echo "efetch -db pmc -format xml -id {}"  | sh > abs/1036680/refs0.xml
