#!/usr/bin/env Jython -J-Xmx8000m

from uk.ac.ebi.brain.core import Brain
from owltools.graph import OWLGraphWrapper

import re
import sys
import warnings
from urllib2 import urlopen
import os
    
def get_class_names(URL):
    """Return the labels and OBO synonym strings of the classes in an ontology.

    URL is handed unchanged to Brain.learn() to load the ontology.  For every
    class returned by the subclasses-of-"Thing" query, the class label is
    appended to the result, followed by any synonym strings reported by
    OWLGraphWrapper.getOBOSynonymStrings().  The result is a flat list and
    may contain duplicates.

    All terms must have an RDFS:label for this to work.
    """
    classNameList = []
    ont = Brain()
    try:
        ont.learn(URL)
        ogw = OWLGraphWrapper(ont.getOntology())

        # Should be able to replace this query with a request for the classes
        # in the signature of the owlapi ontology object.
        for claz in ont.getSubClasses("Thing", 0):
            classNameList.append(ont.getLabel(claz))
            # The graph wrapper lookup seems to want an OBO-style ID, so the
            # underscores in the name Brain returns are turned into colons.
            obo_id = claz.replace('_', ':')
            clazo = ogw.getOWLClassByIdentifier(obo_id)
            # Not sure why the second argument is needed - the method seems
            # to want a list, so an empty one is passed.
            syns = ogw.getOBOSynonymStrings(clazo, [])
            # Guard against a null/empty result (presumably possible for
            # classes without synonyms - TODO confirm); extend(None) would
            # raise a TypeError.
            if syns:
                classNameList.extend(syns)
    finally:
        # Always release the Brain instance, even if loading or a lookup
        # fails part-way through.
        ont.sleep()
    return classNameList

        
def get_uniq_wordList(phraseList):
    """Return a dict whose keys are the unique words found in phraseList.

    A word is a maximal run of alphabetic characters that is at least two
    characters long.  Digits, underscores, punctuation and whitespace all
    act as separators, and single letters are dropped wherever they occur
    in a phrase (including at the start and in runs such as "a b c").

    Every value in the returned dict is 1; the dict is used as a set.
    """
    # [^\W\d_] means "a word character that is neither a digit nor an
    # underscore", i.e. a letter; {2,} rejects single-letter runs.
    word_pattern = re.compile(r"[^\W\d_]{2,}")
    word_dict = {}
    for phrase in phraseList:
        for word in word_pattern.findall(phrase):
            word_dict[word] = 1
    return word_dict

def get_base_uris(mode):
    """Return (baseURI, OE_baseURI, GO_baseURI) for a command-line mode flag.

    mode is '-l' for local file paths or '-r' for remote URLs.  Any other
    value (including None) returns None.
    """
    if mode == '-l':
        return ("file:///repos/go_trunk_ont/",
                "file:///Users/davidos/oboedit_config/dict/",
                "file:///repos/go_trunk_ont/editors/")
    if mode == '-r':
        return ("http://purl.obolibrary.org/obo/go/",
                "http://sourceforge.net/p/geneontology/svn/HEAD/tree/java/oboedit/trunk/src/org/oboedit/resources/",
                "http://viewvc.geneontology.org/viewvc/GO-SVN/trunk/ontology/editors/")
    return None


# Set the base URIs for local vs remote mode based on the first command-line
# argument.  A missing argument is treated like an unrecognised one.
mode_arg = None
if len(sys.argv) > 1:
    mode_arg = sys.argv[1]

base_uris = get_base_uris(mode_arg)
if base_uris is None:
    usage = "Unrecognised arg %s.  This script takes a single arg: -l for local paths and -r for remote paths (URLs)." % mode_arg
    if __name__ == '__main__':
        # Stop here rather than failing later, after the ontologies have
        # already been loaded.
        sys.exit(usage)
    # When imported rather than run, only warn and leave the URIs empty.
    warnings.warn(usage)
    base_uris = ('', '', '')

baseURI, OE_baseURI, GO_baseURI = base_uris


# Ontology files whose class labels and synonyms feed the imports dictionary.
# The whole of ChEBI is used instead of baseURI + "extensions/chebi_import.owl"
# because many spell check errors come from chemical names in terms and defs
# for which there is no chebi import.
import_ontology_urls = [
    "http://purl.obolibrary.org/obo/chebi.owl",
    baseURI + "extensions/ncbitaxon_import.owl",
    baseURI + "extensions/cl_import.owl",
    baseURI + "extensions/po_import.owl",
]

# Collect every label and synonym, in the order the ontologies are listed.
all_names = []
for ontology_url in import_ontology_urls:
    all_names.extend(get_class_names(ontology_url))

# Reduce the collected phrases to a dict keyed by unique word.
uniq_wordList = get_uniq_wordList(all_names)

# Read the two existing OBO-Edit dictionaries.  urlopen() takes no file mode:
# its second positional argument is request data, so passing 'rU' would turn
# an HTTP fetch into a POST.  Windows line endings are handled by rstrip().
all_dict = {}  # The keys of this dict make a uniq'd list of words in current OE dictionaries.
standard_dict_file = urlopen(OE_baseURI + 'standard.dict')
try:
    go_dict_file = urlopen(GO_baseURI + 'oboedit_user.dict')
    try:
        for word in go_dict_file:
            all_dict[word.rstrip()] = 1  # rstrip whitespace, inc. line ending, from end.
        for word in standard_dict_file:
            all_dict[word.rstrip()] = 1
    finally:
        go_dict_file.close()
finally:
    standard_dict_file.close()

# Write every word that is not already in one of the existing dictionaries.
# Mode "w" truncates the file, so it is regenerated from scratch on each run.
imports_dict_file = open('/repos/go_trunk_ont/editors/oboedit_go_imports.dict', "w")  # Filehandle for output file.
i = 0  # counter for new words added
try:
    for word in uniq_wordList:
        if word not in all_dict:
            # NOTE(review): words are written in dict iteration order, not
            # sorted.  Does this matter?
            imports_dict_file.write(word + "\n")
            all_dict[word] = 1
            i += 1
finally:
    # Close explicitly so the output is flushed even if a write fails.
    imports_dict_file.close()

# Parenthesised single argument: valid as a Python 2 print statement too.
print("Regenerated imports dict.  Now contains %s words based on a review of %s words from import file labels and syonyms" % (i, len(uniq_wordList)))
