#!/usr/bin/env python3
"""
Gather different information from a Galaxy server (by default https://usegalaxy.org)
and EDAM ontology (by default from http://edamontology.org/EDAM.owl)
"""
# Import ------------------------------
# General libraries
import os
import json
import logging
# External libraries
import requests
import rdflib
# Constant(s) ------------------------------
# Directory holding the local JSON files read by this module (the
# usegalaxy.org snapshots and the default EDAM-to-Galaxy mapping).
# abspath() guards against a bare/relative __file__: dirname() would then
# return "" and the data directory would wrongly resolve to "/data".
LOCAL_DATA = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
# Logger
LOGGER = logging.getLogger(__name__)
# Class(es) ------------------------------
class GalaxyInfo(object):
    """
    Class to gather different information about a Galaxy instance.

    By default, if the galaxy_url is None, information is loaded from local files
    located in the `data/` folder corresponding to https://usegalaxy.org.
    """

    # Seconds to wait for each Galaxy API call, so that an unresponsive
    # server cannot block the caller forever.
    _REQUEST_TIMEOUT = 60

    def __init__(self, galaxy_url):
        """
        :param galaxy_url: URL of the Galaxy instance, or None to read the local
                           files of the `data/` folder instead of querying a server.
        :type galaxy_url: STRING

        :class:`tooldog.edam_to_galaxy.GalaxyInfo` object is initialized with several
        pieces of information from the given Galaxy instance. It contains:

        :ivar galaxy_url: URL the information corresponds to.
        :ivar version: version of the Galaxy instance (`version_major` field).
        :ivar edam_formats: DICT mapping edam_format to LIST of datatype extensions.
        :ivar edam_data: DICT mapping edam_data to LIST of datatype extensions.
        :ivar hierarchy: `class_to_classes` part of the datatypes mapping, which maps
                         each class to its parental classes.
        :ivar class_names: `ext_to_class_name` part of the datatypes mapping, which
                           maps the extension of a datatype to its class in Galaxy.
        """
        if galaxy_url is None:
            self.galaxy_url = "https://usegalaxy.org"
            LOGGER.info("Loading Galaxy info (https://usegalaxy.org) from %s", LOCAL_DATA)
            api_edam_formats = self._load_local_json("edam_formats.json")
            api_edam_data = self._load_local_json("edam_data.json")
            mapping = self._load_local_json("mapping.json")
            version = self._load_local_json("version.json")
        else:
            self.galaxy_url = galaxy_url
            # Tolerate a trailing slash in the URL to avoid requesting "//api/...".
            api_url = galaxy_url.rstrip("/") + "/api"
            LOGGER.info("Loading galaxy info from %s", api_url)
            api_edam_formats = self._fetch_json(api_url + "/datatypes/edam_formats")
            api_edam_data = self._fetch_json(api_url + "/datatypes/edam_data")
            mapping = self._fetch_json(api_url + "/datatypes/mapping")
            version = self._fetch_json(api_url + "/version")
        # Get version of Galaxy instance
        self.version = version['version_major']
        # The API maps extension -> EDAM term; lookups here go the other way round.
        self.edam_formats = self._reverse_mapping(api_edam_formats)
        self.edam_data = self._reverse_mapping(api_edam_data)
        # Store hierarchy from mapping and class names
        self.hierarchy = mapping["class_to_classes"]
        self.class_names = mapping["ext_to_class_name"]

    @staticmethod
    def _load_local_json(file_name):
        """
        Load one JSON file from the local `data/` folder.

        :param file_name: name of the file inside LOCAL_DATA.
        :type file_name: STRING

        :return: decoded JSON content.
        """
        with open(os.path.join(LOCAL_DATA, file_name), encoding="utf-8") as json_file:
            return json.load(json_file)

    @classmethod
    def _fetch_json(cls, url):
        """
        Download `url` and decode the response body as JSON.

        :param url: full URL of the API endpoint.
        :type url: STRING

        :return: decoded JSON content.
        """
        return requests.get(url, timeout=cls._REQUEST_TIMEOUT).json()

    @staticmethod
    def _reverse_mapping(dictionary):
        """
        Reverse dictionary key -> value to value -> LIST of key.

        :param dictionary: mapping with hashable values.
        :type dictionary: DICT

        :return: reversed mapping; keys keep the iteration order of `dictionary`.
        :rtype: DICT
        """
        reversed_dict = {}
        for key, value in dictionary.items():
            reversed_dict.setdefault(value, []).append(key)
        return reversed_dict

    def select_root(self, datatypes):
        """
        Select the root datatype from all given datatypes.

        The root is the given datatype whose Galaxy class has the fewest entries
        in the class hierarchy. Datatypes unknown to the extension mapping are
        skipped, and classes inheriting from both Binary and Text are ignored.
        When no candidate remains, the first given datatype is returned.

        :param datatypes: list of different datatypes.
        :type datatypes: list of STRING

        :return: root datatype (always one of the given datatypes).
        :rtype: STRING
        """
        # Candidate classes mapped to their parental classes
        sub_dict = {}
        # For each candidate class, the first *given* extension that has this
        # class, so the returned value is guaranteed to come from `datatypes`.
        first_ext = {}
        for datatype in datatypes:
            if datatype not in self.class_names:
                LOGGER.warning("%s was not found in the ext to class mapping. skipped",
                               datatype)
                continue
            class_name = self.class_names[datatype]
            parents = self.hierarchy[class_name]
            # Remove class that inherit from both Binary and Text
            if ('galaxy.datatypes.binary.Binary' in parents and
                    'galaxy.datatypes.data.Text' in parents):
                continue
            sub_dict[class_name] = parents
            first_ext.setdefault(class_name, datatype)
        LOGGER.debug(sub_dict)
        if not sub_dict:
            LOGGER.warning("No best datatype found, return first datatype of the list")
            return datatypes[0]
        # Find root: the shortest hierarchy wins, first candidate on ties.
        selected_class = min(sub_dict, key=lambda name: len(sub_dict[name]))
        return first_ext[selected_class]
class EdamInfo(object):
    """
    Contains the given EDAM ontology.

    It is also possible to generate several dictionaries to help interrogating the
    ontology for a faster access.
    """

    # Source parsed when no explicit EDAM location is given.
    _DEFAULT_EDAM_URL = "http://edamontology.org/EDAM.owl"

    def __init__(self, edam_url):
        """
        :param edam_url: path or URL to an EDAM.owl file. When None, the ontology
                         is loaded from http://edamontology.org/EDAM.owl.
        :type edam_url: STRING

        The whole EDAM ontology is stored as a graph in `self.edam_ontology`.
        `self.version` holds the first result of the version query, or None when
        the query returns no row.
        """
        # A user-supplied location must be parsed as well; otherwise neither
        # the ontology nor the version would ever be set on this object.
        source = self._DEFAULT_EDAM_URL if edam_url is None else edam_url
        LOGGER.info("Loading EDAM info from %s", source)
        self.edam_ontology = rdflib.Graph()
        self.edam_ontology.parse(source)
        # Get version of EDAM ontology
        version_query = """SELECT ?version WHERE {
                           <http://edamontology.org> doap:Version ?version}"""
        self.version = None
        for row in self.edam_ontology.query(version_query):
            self.version = row[0]
            break

    def generate_hierarchy(self):
        """
        Generates two dictionaries of the EDAM hierarchy (format and data) with the
        following structure:

        DICT[edam_uri] -> LIST of edam_uri from parents

        The dictionaries can be accessed via `self.edam_format_hierarchy` and
        `self.edam_data_hierarchy`. Only terms having at least one parent in the
        queried subset appear as keys.
        """
        formats_query = """SELECT ?format ?superformat WHERE {
                           ?format rdfs:subClassOf ?superformat .
                           ?superformat oboInOwl:inSubset
                           <http://purl.obolibrary.org/obo/edam#formats>
                           }"""
        data_query = """SELECT ?data ?superdata WHERE {
                        ?data rdfs:subClassOf ?superdata .
                        ?superdata oboInOwl:inSubset
                        <http://purl.obolibrary.org/obo/edam#data>
                        }"""
        self.edam_format_hierarchy = self._make_hierarchy(formats_query)
        self.edam_data_hierarchy = self._make_hierarchy(data_query)

    def _make_hierarchy(self, query):
        """
        Build hierarchy for a given query.

        :param query: SPARQL query returning (term, parent term) rows.
        :type query: STRING

        :return: generated hierarchy, keyed by the last path segment of the term
                 URI and listing the last path segment of each parent URI.
        :rtype: DICT
        """
        hierarchy = {}
        for row in self.edam_ontology.query(query):
            # Keep only the identifier at the end of the URI
            uri = row[0].split('/')[-1]
            p_uri = row[1].split('/')[-1]
            hierarchy.setdefault(uri, []).append(p_uri)
        return hierarchy
class EdamToGalaxy(object):
    """
    Class to make the link between EDAM ontology terms (edam_format and edam_data) and
    Galaxy datatypes.
    """

    def __init__(self, galaxy_url=None, edam_url=None, mapping_json=None):
        """
        :param galaxy_url: URL of the galaxy instance.
        :type galaxy_url: STRING
        :param edam_url: path to EDAM.owl file (URL or local path).
        :type edam_url: STRING
        :param mapping_json: path to personalized EDAM mapping to Galaxy.
        :type mapping_json: STRING

        When `mapping_json` points to an existing file, the mapping is loaded from
        it. Otherwise the mapping is generated from the Galaxy instance and the
        EDAM ontology and then exported to that path.
        """
        if mapping_json is None:
            if galaxy_url or edam_url:
                # Custom sources: use a mapping file in the working directory
                mapping_json = 'edam_to_galaxy.json'
            else:
                mapping_json = os.path.join(LOCAL_DATA, "edam_to_galaxy.json")
        # Generates or Loads ?
        if os.path.isfile(mapping_json):
            self.load_local_mapping(mapping_json)
        else:
            # No local file exists, needs to generate it (takes a little bit of time)
            self.edam = EdamInfo(edam_url)
            self.edam_version = self.edam.version
            self.edam.generate_hierarchy()
            self.galaxy = GalaxyInfo(galaxy_url)
            self.galaxy_url = self.galaxy.galaxy_url
            self.galaxy_version = self.galaxy.version
            self.generate_mapping()
            self.export_info(mapping_json)

    def generate_mapping(self):
        """
        Generates mapping between edam_format and edam_data to Galaxy datatypes
        based on the information of the Galaxy instance and the EDAM ontology.

        Every edam_format and edam_data of the EDAM hierarchies will be given a
        datatype (or "NO mapping"). The result is stored in two dictionaries:
        `self.format_to_datatype` and `self.data_to_datatype`.
        """
        LOGGER.info("Generating new EDAM mapping to Galaxy datatypes file...")
        # EDAM formats
        self.format_to_datatype = self._map_datatypes(self.edam.edam_format_hierarchy,
                                                      self.galaxy.edam_formats)
        # EDAM data
        self.data_to_datatype = self._map_datatypes(self.edam.edam_data_hierarchy,
                                                    self.galaxy.edam_data)

    def _map_datatypes(self, edam_hierarchy, galaxy_mapping):
        """
        Maps all EDAM terms of a hierarchy to a Galaxy datatype.

        :param edam_hierarchy: edam_hierarchy from :class:`tooldog.edam_to_galaxy.EdamInfo`
        :type edam_hierarchy: DICT
        :param galaxy_mapping: mapping from :class:`tooldog.edam_to_galaxy.GalaxyInfo`
        :type galaxy_mapping: DICT

        :return: mapping EDAM term to Galaxy datatype (unique mapping).
        :rtype: DICT
        """
        return {edam: self._find_datatype(edam, edam_hierarchy, galaxy_mapping)
                for edam in edam_hierarchy}

    def _find_datatype(self, edam, edam_hierarchy, galaxy_mapping):
        """
        Find the best datatype for a given EDAM term.

        The term itself is tried first; while no datatype is found, the search
        moves to the first parent of the current term.

        :param edam: EDAM term.
        :type edam: STRING
        :param edam_hierarchy: edam_hierarchy from :class:`tooldog.edam_to_galaxy.EdamInfo`
        :type edam_hierarchy: DICT
        :param galaxy_mapping: mapping from :class:`tooldog.edam_to_galaxy.GalaxyInfo`
        :type galaxy_mapping: DICT

        :return: datatype for the term, or "NO mapping" when neither the term nor
                 the followed parents are mapped.
        :rtype: STRING
        """
        visited = set()
        # The visited set stops the walk if the hierarchy ever contains a cycle.
        while edam not in visited:
            visited.add(edam)
            datatypes = galaxy_mapping.get(edam)
            if datatypes:
                if len(datatypes) == 1:
                    LOGGER.debug("Exactly one datatype found for %s: %s",
                                 edam, datatypes[0])
                    return datatypes[0]
                LOGGER.debug("More than one datatypes found for %s", edam)
                return self.galaxy.select_root(datatypes)
            LOGGER.debug("No datatype found for %s. Looking at parental terms.", edam)
            # Top-level terms are not keys of the hierarchy, hence .get().
            parents = edam_hierarchy.get(edam)
            if not parents:
                LOGGER.debug("No parental EDAM found. %s is skipped.", edam)
                return "NO mapping"
            if len(parents) > 1:
                LOGGER.debug("%s inherits from more than one EDAM. "
                             "Only first EDAM parent of the list is treated: %s",
                             edam, parents[0])
            edam = parents[0]
        return "NO mapping"

    def load_local_mapping(self, local_file):
        """
        Method to load (from JSON file) mapping previously generated and exported in the
        `local_file`.

        :param local_file: path to the mapping local file.
        :type local_file: STRING
        """
        LOGGER.info("Loading EDAM mapping to Galaxy datatypes from %s", local_file)
        with open(local_file, 'r', encoding="utf-8") as file_path:
            json_file = json.load(file_path)
        self.format_to_datatype = json_file['format']
        self.data_to_datatype = json_file['data']
        self.galaxy_url = json_file['galaxy_url']
        self.galaxy_version = json_file['galaxy_version']
        self.edam_version = json_file['edam_version']

    def export_info(self, export_file):
        """
        Method to export mapping of this object to a JSON file.

        :param export_file: path to the file.
        :type export_file: STRING
        """
        LOGGER.info("Exporting new EDAM mapping to Galaxy datatypes file to ./%s",
                    export_file)
        with open(export_file, 'w', encoding="utf-8") as file_path:
            json.dump({'format': self.format_to_datatype,
                       'data': self.data_to_datatype,
                       'edam_version': self.edam_version,
                       'galaxy_url': self.galaxy_url,
                       'galaxy_version': self.galaxy_version}, file_path)

    def get_datatype(self, edam_data=None, edam_format=None):
        """
        Get datatype from EDAM terms.

        The format term takes precedence over the data term when both are given.

        :param edam_data: EDAM data term.
        :type edam_data: STRING
        :param edam_format: EDAM format term.
        :type edam_format: STRING

        :return: datatype corresponding to given EDAM ontologies, or the string
                 "no EDAM given" when both terms are None.
        :rtype: STRING
        :raises KeyError: if the given term is not part of the mapping.
        """
        if edam_format is not None:
            return self.format_to_datatype[edam_format]
        if edam_data is not None:
            return self.data_to_datatype[edam_data]
        return "no EDAM given"