# Builds the InterPro-derived and GO-derived domain ontologies (ipr-core,
# iprgo, domo), merges them, and produces match/mismatch reports.
#
# Conventions used below:
#   - recipes that can fail half-way write to $@.tmp and then mv into place;
#   - .DELETE_ON_ERROR removes any other half-written target when its recipe
#     fails, so a truncated file is never mistaken for an up-to-date one
#     (targets listed under .PRECIOUS are exempt from that deletion);
#   - recipes that pipe into sort/grep only report the exit status of the
#     last command in the pipeline.

URL = ftp://ftp.ebi.ac.uk/pub/databases/interpro

.DELETE_ON_ERROR:
.PHONY: all clean

all: iprgo.obo ipr-core.obo domo.obo merged.obo unmatched.txt

## ----------------------------------------
## InterPro-derived Ontology
## ----------------------------------------

ParentChildTreeFile.txt:
	wget $(URL)/$@
.PRECIOUS: ParentChildTreeFile.txt

short_names.dat:
	wget $(URL)/$@

interpro.xml.gz:
	wget $(URL)/$@
.PRECIOUS: interpro.xml.gz

interpro.xml: interpro.xml.gz
	gzip -dc $< > $@.tmp && mv $@.tmp $@

# id / short_name / type for every <interpro> element
# NOTE(review): the -o field separator is reproduced here as a single space;
# confirm it was not originally a tab.
ipr-names.tbl: interpro.xml
	xmlstarlet sel -t -m "//interpro" -v '@id' -o " " -v '@short_name' -o " " -v @type --nl $< > $@

ipr-names.obo: ipr-all.tbl
	./iprtbl2obo.pl $< > $@

# all of interpro in tabular format
ipr-all.tbl: interpro.xml
	xmlstarlet sel -t -m "//interpro" -v '@id' -o "|" -v '@short_name' -o "|" -v 'name' -o "|" -v @type --nl $< > $@

# ipr-core is just the portion of InterPro that is part of the interpro hierarchy
ipr-core.obo: ParentChildTreeFile.txt
	./ipr-parent-child-to-obo.pl $< > $@.tmp && mv $@.tmp $@

## ----------------------------------------
## GO-derived Ontology of domains
## ----------------------------------------
# IDspace: IPRGO
# shadows the GO ID - each IPRGO ID is the implicit domain in GO

GO_XP_PRO = ../../ontology/extensions/x-mf-protein.obo

# protein binding node
PB = GO:0005515

# iprgo-core.obo - IPRs classified under IPRGO (GO-derived domains and families)
#
# we use Marijn's file as a bridge between the GO-derived ontology and IPR;
# each link is an is_a
iprgo-core.obo: interpro_binding_edited.txt
	./mapping2ont.pl $< > $@.tmp && mv $@.tmp $@

# go-binding-subset.obo - subset of GO under PB
#
# make a subset of GO with just descendants of GO:0019904 ! protein domain specific binding
# (note this whole procedure works for larger subsets - e.g. using
# 'RNA polymerase binding' to make 'RNA polymerase' is_a enzyme)
#
# The active rule queries under $(PB); the GO:0019904 variant and two
# alternative queries are kept commented out for reference.
# $(GO_XP_PRO) is listed as a prerequisite because the recipe loads it via -i.
#go-domain-binding-subset.obo:
#	blip ontol-query -r go -query "subclassT(ID,'GO:0019904')" -to obo > $@
go-binding-subset.obo: $(GO_XP_PRO)
	blip ontol-query -r go -i $(GO_XP_PRO) -query "subclassT(ID,'$(PB)'),\+genus(ID,_) " -to obo > $@.tmp && obo-grep.pl -r 'name: .* binding' $@.tmp > $@
#	blip ontol-query -r go -query "subclassT(ID,'$(PB)') " -to obo > $@.tmp && obo-grep.pl -r 'name: .* binding' $@.tmp > $@
#	blip ontol-query -r go -query "subclassT(ID,'$(PB)'),\+ (( subclassT(ID,X),subclass(X,'GO:0005488'),X\=$(PB))) " -to obo > $@.tmp && obo-grep.pl -r 'name: .* binding' $@.tmp > $@

# iprgo-derived.obo - shadow of GO PB hierarchy; extract implicit protein hierarchy
#
# E.g.
#   X binding
#     Y binding
#
#  ==> X is_a Y
#
# Also uses IPR IDs in the definition to generate equivalence axioms; e.g.
# GO:0097162 MADS box domain binding has def xref InterPro:IPR002100
iprgo-derived.obo: go-binding-subset.obo
	./go2ipr.pl $< > $@

## --
## next we combine ontologies and remove redundant edges
## --

# Ontologies to be combined
IN = header.obo ipr-core.obo iprgo-core.obo iprgo-derived.obo ipr-names.obo

# InterPro IDs take priority when merging
# (merges generally result from definition xrefs in GO)
MERGE = --merge-equivalence-sets -s IPRGO 1 -s IPR 10 -l IPRGO 9 -l IPR 2

iprgo-stage1.owl: $(IN)
	owltools $(IN) --merge-support-ontologies --reasoner elk $(MERGE) --assert-inferred-subclass-axioms --markIsInferred -o $@

iprgo-stage1.obo: iprgo-stage1.owl
	owltools $< -o -f obo --no-check $@

# only use names and exact syns
iprgo-stage1-strict.obo: iprgo-stage1.obo
	grep -E -v '(NARROW|RELATED|BROAD)' $< > $@

## ----------------------------------------
## Merging the two ontologies
## ----------------------------------------

# here we perform a more aggressive merge based on text matching
# pairs matches using entity recognition
#
# note: we currently use the strict file (EXACT syns and labels only)
matches-labeled.txt: iprgo-stage1-strict.obo
	blip-findall -u metadata_nlp -consult ignore.pro -i $< -goal index_entity_pair_label_match "entity_pair_label_reciprocal_best_intermatch(X,Y,S)" -no_pred -label -use_tabs | sort -u > $@.tmp && mv $@.tmp $@

# keep columns 1 and 3 of the labeled matches
matches.txt: matches-labeled.txt
	cut -f1,3 $< > $@

# translate pairs into OWL equivalence axioms
matches.owl: matches.txt
	owltools --create-ontology test --parse-tsv -a EquivalentClasses $< -o $@

# iprgo - the almost final product
#
# note this includes *all* of interpro (plus any IPRGO groupings), perhaps around 26k classes
iprgo.owl: iprgo-stage1.owl matches.owl
	owltools $^ --merge-support-ontologies --reasoner elk $(MERGE) --assert-inferred-subclass-axioms --markIsInferred -o $@

iprgo.obo: iprgo.owl
	owltools $< -o -f obo --no-check $@

# domo - final product; no orphans
#
# Remove all orphan interpros
domo.obo: iprgo.obo
	obo-grep.pl -r '(is_a|IPR:000000)' $< | grep -v ^owl-axiom > $@.tmp && mv $@.tmp $@

# report on all that do not have a direct equivalent in IPR
unmatched-domain.txt: domo.obo
	blip-findall -r go -i $< "class(ID),atom_concat('IPRGO:',Frag,ID),atom_concat('GO:',Frag,GID),subclassT(GID,'GO:0019904'),findall(Y,parent(ID,Y),Ys)" -select ID-Ys -label -no_pred | sort -u > $@

unmatched.txt: domo.obo
	blip-findall -r go -i $< "class(ID),atom_concat('IPRGO:',Frag,ID),findall(Y,parent(ID,Y),Ys)" -select ID-Ys -label -no_pred | sort -u > $@

matched.txt: domo.obo
	blip-findall -i $< "class(ID),id_idspace(ID,'IPR'),findall(Y,parent(ID,Y),Ys)" -select ID-Ys -label -no_pred > $@

# implicit GO-derived family/domain ontology classes with no IPR children;
# for those that are not already leaf proteins (todo - PR integration) we should find the IPR classes
no-ipr-children.txt: domo.obo
	blip-findall -i $< "class(C),id_idspace(C,'IPRGO'),\+subclass(_,C)" -select C -label | sort -u > $@

## ----------------------------------------
## Logical definitions of GO classes
## ----------------------------------------

# we use obol to parse GO labels of the form 'X binding'

# Remove the generated iprgo* files. $(RM) is `rm -f`, so this also succeeds
# when there is nothing to delete.
clean:
	$(RM) iprgo*

#new-domain.txt:
#	obol qobol-newterms -ontology GO -tag domain -subclass GO:0019904 > $@

x-domain-1.obo: domo.obo
	obol qobol -ontology GO -i $< -tag domain -subclass $(PB) -export obo > $@.tmp && cat $@.tmp | obo-grep.pl -r IPR - > $@
#	obol qobol -ontology GO -i iprgo.obo -tag domain -subclass GO:0019904 -export obo > $@

x-domain.obo: x-domain-1.obo has_input.obo
	cat $^ > $@

x-domain.owl: x-domain.obo
	owltools $< -o $@

# this should now be in the ontology
x-protein.obo: $(GO_XP_PRO)
	obol qobol -ontology GO -xont PR -i $< -newonly -tag protein -tag binding -subclass $(PB) -export obo > $@.tmp && mv $@.tmp $@

# keep only the stanzas matching 'intersection_of:'
go-defined-classes.obo: ../../ontology/editors/gene_ontology_write.obo
	obo-grep.pl -r intersection_of: $< > $@

go-binding-subset-anc.obo:
	blip ontol-query -r go -query "subclassRT(ID,'$(PB)')" -to obo > $@

# Inputs of the final merge
XIN = go-binding-subset-anc.obo x-domain.obo domo.obo

merged.owl: $(XIN)
	owltools $(XIN) --merge-support-ontologies -o $@

merged.obo: merged.owl
	owltools $< -o -f obo --no-check $@

merged-inf.obo: merged.owl
	owltools $< --assert-inferred-subclass-axioms --markIsInferred -o -f obo --no-check $@

## Protein to InterPro

# The pattern stem ($*) is passed to the fetch script as its first argument,
# followed by the output file name.
protein2ipr-%.csv:
	fetch-protein2ipr.sh $* $@
.PRECIOUS: protein2ipr-%.csv

protein2ipr-%.ttl: protein2ipr-%.csv
	./csv2ttl.pl $< > $@.tmp && mv $@.tmp $@
.PRECIOUS: protein2ipr-%.ttl

protein2ipr-%.owl: protein2ipr-%.ttl
	owltools $< -o $@

protein2ipr-%.obo: protein2ipr-%.ttl
	owltools $< -o -f obo $@

merged-%.owl: protein2ipr-%.owl $(XIN)
	owltools $^ --merge-support-ontologies -o $@

merged-%.obo: merged-%.owl
	owltools $< --set-ontology-id $@ -o -f obo --no-check $@