Source code for projit.projit

from datetime import datetime
import pandas as pd
import numpy as np
import hashlib
import time
import json
import git
import re
import os

from .config import lock_file
from .config import config_file
from .config import execution_file
from .config import tag_file
from .config import config_folder
from .template import load_template
from .utils import locate_projit_config
from .pdf import PDF

##########################################################################################


[docs]
class Projit:
    """
    Projit Class.
    This is a data structure to contain the core elements of a data science 
    project. It will permit loose coupling between processes and experiments
    but provide a simple overarching structure for communication and 
    documentation.
    """

    def __init__(self, 
                 path, 
                 name, 
                 desc="", 
                 experiments=[], 
                 datasets={},
                 results={}, 
                 params={},
                 hyperparams={},
                 dataresults={},
                 executions={},
                 tags={}
    ):
        """
        Initialise a projit project object.
        This class will be used for storing and retrieving all data about the project,
        as well as ensuring that it is written to the projit meta-data files.

        :param path: The path to the project file.
        :type path: string, required

        :param name: The project name
        :type name: string, required

        :param desc: The project description
        :type desc: string, optional

        :param experiments: The array of experiments
        :type experiments: Array, optional

        :param datasets: The dictionary of datasets 'name':'path'
        :type datasets: Dictionary, optional

        :param results: The dictionary of results by experiment.
                        Structure: {'experiment':{'metric':'value'}}
        :type results: Dictionary of Dictionary, optional

        :param params: A dictionary of additional parameters share across experiments. 
                       For example: target variable name, identifier column.
        :type params: Dictionary, optional

        :param hyperparams: A dictionary of hyper parameters for experiments. 
                       Structure: {'experiment':{'param':'value', etc}} 
        :type hyperparams: Dictionary, optional

        :param dataresults: The dictionary of results on specific data sets.
                            These are used when you want your experimental results broken
                            down by the datasets. 
                            Structure: {'dataset':{'experiment':{'metric':'value'}}}
        :type dataresults: Dictionary of Dictionary of Dictionary, optional

        :param executions: The dictionary of experiment executions.
                            This structure is used to store all experimental runs.
                            The ID is a HASH of experiment_name and 
                            Structure: {'experiment_name':{
                                             'ID':{ 
                                                 'start':DATETIME, 
                                                 'end':DATETIME,
                                                 'githash':STRING, 
                                                 'params':DICT,
                                                 'hyperparams':DICT
                                              }
                                           }
                                       }
        :type executions: Dictionary of Dictionary of Dictionary, optional
 
        :param tags: The dictionary of tags for project assets.
        :type tags: Dictionary of Dictionary of Dictionary, optional

        :return: None 
        :rtype: None 
        """
        self.path = path
        self.name = name
        self.desc = desc
        self.experiments = experiments
        self.datasets = datasets
        self.results = results
        self.params = params
        self.hyperparams = hyperparams
        self.dataresults = dataresults
        self.executions = executions
        self.tags = tags



[docs]
    def get_root_path(self):
        """
        Return the project root, derived from the config directory path.

        The result is ``self.path`` with its last ``len(config_folder)``
        characters removed.

        :return: path : The Path to the Project folder
        :rtype: String
        """
        prefix_length = len(self.path) - len(config_folder)
        return self.path[:prefix_length]




[docs]
    def start_experiment(self, name, path, params=None, tags=None):
        """
        Start an experiment execution.
        This function will create a new experiment if this is the first execution
        otherwise it will simply add a new execution record.
        Function returns an unique identifer for the execution: required
        to end the execution in a call to :meth:`projit.Projit.end_experiment`

        :param name: The experiment name (Unique Identifer)
        :type name: string, required

        :param path: The path to the experiment script being executed
        :type path: string, required

        :param params: Optional dictionary of parameters used in the experiment execution
        :type params: Dictionary, optional

        :param tags: Optional dictionary of tags to describe the experiment
        :type tags: Dictionary, optional

        :return: id : The Execution ID (SHA-256 hex digest of name + start time)
        :rtype: String
        """
        params = {} if params is None else params

        self.initiate_lock()
        try:
            self.reload()

            # Register the experiment inline. add_experiment() acquires the
            # lock itself, so calling it while we hold the lock would wait
            # forever on our own lock file.
            if not self.experiment_exists(name):
                self.experiments.append((name, path))

            startdt = str(datetime.now())
            exec_id = hashlib.sha256((name + startdt).encode()).hexdigest()
            try:
                repo = git.Repo(search_parent_directories=True)
                ghash = repo.head.object.hexsha
            except git.exc.InvalidGitRepositoryError:
                # Not inside a git repository: record an empty hash
                ghash = ""
            payload = {'start':startdt, 'end':"", 'githash':ghash, 'params':params}

            self.executions.setdefault(name, {})[exec_id] = payload
            self.save()
        finally:
            # Always drop the lock, otherwise every later call blocks forever
            self.release_lock()

        # add_tags() takes the lock on its own, so it runs after the release
        if tags:
            self.add_tags("experiment", name, tags)

        return exec_id




[docs]
    def end_experiment(self, name, id, hyperparams=None):
        """
        End an experiment execution.
        This function require both the experiment name and the hash ID of the previously started execution

        :param name: The experiment name (Unique Identifer)
        :type name: string, required

        :param id: The execution hash ID returned by the function: start_experiment
        :type id: string, required

        :param hyperparams: Optional dictionary of hyperparameters used in the experiment execution
        :type hyperparams: Dictionary, optional

        :raises Exception: If the experiment is not registered, or no execution
                           with the given ID has been started for it.

        :return: None
        :rtype: None
        """
        hyperparams = {} if hyperparams is None else hyperparams

        self.initiate_lock()
        try:
            # Validate against the freshly reloaded state so that experiments
            # registered by another process are taken into account.
            self.reload()

            if not self.experiment_exists(name):
                raise Exception(f"Projit Experiment Exception: Cannot end experiment: '{name}' -- Experiment not registered")

            if name not in self.executions:
                raise Exception(f"Projit Experiment Exception: Cannot end experiment: '{name}' -- Executions not started")
            exper_execs = self.executions[name]

            if id not in exper_execs:
                raise Exception(f"Projit Experiment Exception: Cannot end experiment: '{name}' -- Executions not started")
            payload = exper_execs[id]

            payload['end'] = str(datetime.now())
            payload['hyperparams'] = hyperparams
            self.save()
        finally:
            # Release even when one of the checks above raised; a leftover
            # lock file would block every subsequent projit operation.
            self.release_lock()




[docs]
    def get_experiment_execution_stats(self, name):
        """
        Given an experiment name
        Return the execution statistics

        :param name: The experiment name (Unique Identifer)
        :type name: string, required

        :return: executions, mean_execution_time : A pair of statistics
        :rtype: int, float
        """
        if name in self.executions:
            exec_times = self.get_execution_times(name)
            if len(exec_times) > 0:
                return len(exec_times), np.mean(exec_times)
            else:
                return 0, 0
        else:
            return 0, 0




[docs]
    def get_mean_execution_time(self, name):
        """
        Given an experiment name
        Return the mean execution time

        :param name: The experiment name (Unique Identifer)
        :type name: string, required

        :return: mean_execution_time : The mean time of execution
        :rtype: float
        """
        exec_times = self.get_execution_times(name)
        if len(exec_times) > 0:
            return np.mean(exec_times)
        else:
            return 0




[docs]
    def get_execution_times(self, name):
        """
        Given an experiment name
        Return an list of all  execution times

        :param name: The experiment name (Unique Identifer)
        :type name: string, required

        :return: execution_times : Array of execution times
        :rtype: list(float)
        """
        if name in self.executions:
            exec_times = []
            for execid, exec in self.executions[name].items():
                if exec["end"] != "":
                    a = datetime.strptime(exec["start"], '%Y-%m-%d %H:%M:%S.%f')
                    b = datetime.strptime(exec["end"], '%Y-%m-%d %H:%M:%S.%f')
                    diff = (b-a).seconds
                    exec_times.append(diff)
            return exec_times
        else:
            return []




[docs]
    def add_experiment(self, name, path):
        """
        Add information of a new experiment to the project.
        Then save the project configuration.
        This function will overwrite an experiment of the same name
        and delete any previous results.

        :param name: The experiment name
        :type name: string, required

        :param path: The path to the experiment.
        :type path: string, required

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            if any(elem[0] == name for elem in self.experiments):
                # Rebuild the list rather than removing entries while
                # iterating over it, which would skip elements.
                self.experiments[:] = [elem for elem in self.experiments if elem[0] != name]
                self.clean_experimental_results(name)
            self.experiments.append( (name, path) )
            self.save()
        finally:
            # A leftover lock file would block every later projit call
            self.release_lock()




[docs]
    def update_name_description(self, name, descrip):
        """
        Update the core values name and description and save the project.

        :param name: The project name
        :type name: string, required

        :param descrip: The project description
        :type descrip: string, required

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            # Reload first so the new values are applied on top of the
            # latest state written to disk.
            self.reload()
            self.name = name
            self.desc = descrip
            self.save()
        finally:
            # Release even if reload/save failed, so other calls are not blocked
            self.release_lock()




[docs]
    def dataset_exists(self, name):
        """
        Check if a given dataset is in the data structure

        :param name: The dataset name
        :type name: string, required

        :return: exists
        :rtype: Boolean
        """
        for elem in self.datasets:
            if elem == name:
                return True
        return False




[docs]
    def experiment_exists(self, name):
        """
        Check if a given experiment is in the data structure

        :param name: The experiment name
        :type name: string, required

        :return: exists
        :rtype: Boolean
        """
        for elem in self.experiments:
            if elem[0] == name:
                return True
        return False

 


[docs]
    def validate_asset(self, asset, name):
        """
        Check if a given asset exists

        :param asset: The asset type (experiment|dataset)
        :type asset: string, required

        :param name: The asset name
        :type name: string, required

        :return: exists
        :rtype: Boolean
        """
        if asset=="experiment":
            return self.experiment_exists(name)
        elif asset=="dataset":
            return self.dataset_exists(name)
        else:
            return False




[docs]
    def add_tags(self, asset, name, tags):
        """
        Add tags to a specific asset and save the project.
        Existing tags with the same key are overwritten.

        :param asset: The asset type (experiment|dataset)
        :type asset: string, required

        :param name: The asset name
        :type name: string, required

        :param tags: The dictionary of tags
        :type tags: Dictionary(string:string)

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            # tags are stored as {asset_type: {asset_name: {tag: value}}}
            asset_tags = self.tags.setdefault(asset, {})
            asset_tags.setdefault(name, {}).update(tags)
            self.save()
        finally:
            # Release even if reload/save failed, so other calls are not blocked
            self.release_lock()




[docs]
    def get_tags(self, asset, name, tags):
        """
        Retrive specified tags to a specific asset
        Returns the list of tag values in the same order as requested.        

        :param asset: The asset type (experiment|dataset)
        :type asset: string, required

        :param name: The asset name
        :type name: string, required

        :param tags: The list of tags
        :type tags: list(string)

        :return: tags
        :rtype: list(string)
        """
        if asset in self.tags:
            assets = self.tags[asset]
            if name not in assets:
                return ["" for t in tags]
            else:
                my_asset = assets[name]
                tag_set = []
                for t in tags:
                    if t in my_asset:
                        tag_set.append(my_asset[t])
                    else:
                        tag_set.append("")
                return tag_set
        else:
            return ["" for t in tags]




[docs]
    def clean_experimental_results(self, name):
        """
        Remove all results for a given experiment

        :param name: The experiment name
        :type name: string, required

        :return: None
        :rtype: None
        """
        if name in self.results:
            del self.results[name]
        for dataset in self.dataresults:
            if name in self.dataresults[dataset]:
                del self.dataresults[dataset][name]




[docs]
    def add_dataset(self, name, path):
        """
        Add a named dataset to the project and save it.
        A dataset already registered under the same name is overwritten.

        :param name: The dataset name
        :type name: string, required

        :param path: The path to the data set (either local path, URL or S3 Bucket)
        :type path: string, required

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            self.datasets[name] = path
            self.save()
        finally:
            # Release even if reload/save failed, so other calls are not blocked
            self.release_lock()




[docs]
    def rm_dataset(self, name):
        """
        Remove a named dataset from the project.

        A dataset literally named '.' takes precedence over the
        "remove everything" meaning of '.'.

        :param name: The dataset name (or '.' for all datasets)
        :type name: string, required

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            if name in self.datasets:
                del self.datasets[name]
                self.save()
            elif name == ".":
                self.datasets = {}
                self.save()
        finally:
            # Release even if reload/save failed, so other calls are not blocked
            self.release_lock()




[docs]
    def rm_experiment(self, name):
        """
        Remove a named experiment from the project, together with its
        stored results.

        :param name: The experiment name (or '.' for all experiments)
        :type name: string, required

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            if name == ".":
                for elem in self.experiments:
                    self.clean_experimental_results(elem[0])
                self.experiments = []
            elif any(elem[0] == name for elem in self.experiments):
                # Rebuild the list rather than removing entries while
                # iterating over it, which would skip elements.
                self.experiments = [elem for elem in self.experiments if elem[0] != name]
                self.clean_experimental_results(name)
            self.save()
        finally:
            # Release even if reload/save failed, so other calls are not blocked
            self.release_lock()




[docs]
    def add_param(self, name, value):
        """
        Add a parameter to the project and save it.

        :param name: The parameter name
        :type name: string, required

        :param value: The value taken by that parameter
        :type value: Any

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            self.params[name] = value
            self.save()
        finally:
            # save() raises when the value cannot be written as JSON; the
            # lock must still be released or later calls would block.
            self.release_lock()




[docs]
    def add_hyperparam(self, name, value):
        """
        Add a set of hyper parameters to the project.

        :param name: The experiment name
        :type name: string, required

        :param value: The Dictionary of hyperparameters
        :type value: Dictionary

        :raises Exception: If no experiment with that name is registered.

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            # Check after reloading so that experiments registered by
            # another process are recognised.
            self.reload()
            if not self.experiment_exists(name):
                raise Exception("Projit Experiment Exception: No experiment called: '%s' -- Register your experiment first." % name)
            self.hyperparams[name] = value
            self.save()
        finally:
            # Release on every path, including the exception above
            self.release_lock()




[docs]
    def add_result(self, experiment, metric, value, dataset=None):
        """
        Add results from an experiment to the project.

        They can be overall project results, or associated with a specific dataset

        :param experiment: The experiment name
        :type experiment: string, required

        :param metric: The name of the metric we are adding.
        :type metric: string, required

        :param value: The value of the metric to add.
        :type value: float, required

        :param dataset: The dataset against which the results are generated
        :type dataset: string, optional

        :return: None
        :rtype: None
        """
        self.initiate_lock()
        try:
            self.reload()
            if dataset is None:
                # Overall results: {experiment: {metric: value}}
                self.results.setdefault(experiment, {})[metric] = value
            else:
                # Per-dataset results: {dataset: {experiment: {metric: value}}}
                per_dataset = self.dataresults.setdefault(dataset, {})
                per_dataset.setdefault(experiment, {})[metric] = value
            self.save()
        finally:
            # save() raises when the value cannot be written as JSON; the
            # lock must still be released or later calls would block.
            self.release_lock()




[docs]
    def get_results(self, dataset=None):
        """
        Retrieve the experimental results as a DataFrame.

        They can be overall project results, or associated with a specific dataset

        :param dataset: The dataset against which the results are generated
        :type dataset: string, optional

        :return: DataFrame of results
        :rtype: pandas.DataFrame
        """

        df = pd.DataFrame()
        if dataset==None:
            myresults = self.results
        else:
            if dataset in self.dataresults:
                myresults = self.dataresults[dataset]
            else:
                raise Exception("Projit Dataset Exception: No results for dataset: %s " % dataset)
        for exp in self.experiments:
            key = exp[0]
            if key in myresults:
                rez = myresults[key]
            else:
                rez = {}
            rez['experiment'] = key
            df = pd.concat([df, pd.DataFrame(rez, index=[0])], ignore_index=True)
        # Ensure that the first column in the results is "experiments"
        cols = ["experiment"]
        rest = df.columns.to_list()
        rest.remove('experiment')
        cols.extend(rest)
        return df.loc[:,cols]




[docs]
    def get_dataset(self, name):
        """
        Retrieve the dataset by name.

        :param name: The dataset to retrieve
        :type name: string, required

        :return: Path to dataset
        :rtype: String
        """
        if name in self.datasets:
            return self.datasets[name]
        else:
            raise Exception("Projit Dataset Exception: Named dataset '%s' not available. Register your dataset" % name)




[docs]
    def get_param(self, name):
        if name in self.params:
            return self.params[name]
        else:
            raise Exception("Projit Parameter Exception: Named parameter '%s' is not available:" % name)




[docs]
    def get_hyperparam(self, name):
        if name in self.hyperparams:
            return self.hyperparams[name]
        else:
            raise Exception("Projit Parameter Exception: Hyper parameters for experiemnt '%s' are not available:" % name)




[docs]
    def get_path_to_dataset(self, name):
        ds = self.get_dataset(name)
        if self.is_complete_path(ds):
            return ds
        else:
            return self.create_local_path(ds)




[docs]
    def is_complete_path(self, path):
        if path[0:1] == "/":
            return True
        if path[0:3] == "s3:":
            return True
        if path[0:4] == "http":
            return True
        return False




[docs]
    def create_local_path(self, ds):
        """
        Build a path to a dataset by prefixing it with the project root
        returned by ``get_root_path``. Internal use.

        :param ds: The dataset location relative to the project root
        :type ds: string, required

        :return: Path to dataset
        :rtype: String
        """
        project_root = self.get_root_path()
        return project_root + ds




[docs]
    def initiate_lock(self):
        """
        Lock files are used during processes that modify the project
        so that we get consistent state across parallel executions.

        Blocks until the lock file can be created, retrying every 5 seconds
        while it already exists.

        :return: None
        :rtype: None
        """
        path_to_lock = self.path + "/" + lock_file
        while True:
            try:
                # Mode 'x' creates the file and fails if it already exists,
                # so checking and taking the lock happen in a single step.
                # A separate "exists?" test followed by a write would let
                # two processes both believe they own the lock.
                with open(path_to_lock, 'x') as outfile:
                    json.dump({}, outfile, indent=0)
                return
            except FileExistsError:
                time.sleep(5)




[docs]
    def release_lock(self):
        """
        Release the project lock by deleting the lock file.
        Does nothing when the lock file is not present.

        :return: None
        :rtype: None
        """
        lock_path = self.path + "/" + lock_file
        if not os.path.isfile(lock_path):
            return
        os.remove(lock_path)




[docs]
    def save(self):
        """
        Save your projit project into config files within the projit config dir.

        Three files are written: the core properties (every instance
        attribute except executions and tags), the executions, and the tags.

        :raises TypeError: If some stored value cannot be encoded as JSON.
                           No file is touched in that case.

        :return: None
        :rtype: None
        """
        core_props = self.__dict__.copy()
        core_props.pop('executions', None)
        core_props.pop('tags', None)

        documents = (
            (config_file, core_props),
            (execution_file, self.executions),
            (tag_file, self.tags),
        )
        # Encode everything before opening any file: opening with 'w'
        # truncates, so a failure half-way through encoding would otherwise
        # leave an empty or partial config behind.
        encoded = [(filename, json.dumps(content, indent=0)) for filename, content in documents]

        for filename, text in encoded:
            with open(self.path + "/" + filename, 'w') as outfile:
                outfile.write(text)




[docs]
    def reload(self):
        """
        Reload the project meta-data from disk.
        - Necessary when multiple processes are running experiments
        in the same project and we want to avoid overwriting data.

        Every key of the config file becomes an attribute of this object,
        except ``path``: the instance keeps the path it is currently using,
        because the stored value reflects wherever the project was saved
        from and may not be valid relative to the current process.
        Files that do not exist leave the corresponding state untouched.

        :return: None
        :rtype: None
        """
        path_to_config = self.path + "/" + config_file
        path_to_execs = self.path + "/" + execution_file
        path_to_tags = self.path + "/" + tag_file

        stored = {}
        if os.path.exists(path_to_config):
            with open(path_to_config) as f:
                stored = json.load(f)
        for key, value in stored.items():
            if key == "path":
                # Overwriting self.path would redirect the following save()
                # and release_lock() to a different directory.
                continue
            setattr(self, key, value)

        if os.path.exists(path_to_execs):
            with open(path_to_execs) as f:
                self.executions = json.load(f)

        if os.path.exists(path_to_tags):
            with open(path_to_tags) as f:
                self.tags = json.load(f)




[docs]
    def render(self, path):
        """
        Render the project data into a PDF file.
        Only the project name and description are written.

        :param path: The path to write the PDF to
        :type path: string, required

        :return: None
        :rtype: None
        """
        # The results table was previously fetched here but never used;
        # fetching it only added work and a possible failure to rendering.
        pdf = PDF()
        pdf.setup()
        pdf.add_title(self.name)
        pdf.add_description(self.desc)
        pdf.output(path, 'F')




##########################################################################################

[docs]
def load(config_path):
    """
    This function allows you to instantiate a Projit project from an existing config_path
    The config path must contain the required config file that contains the required fields.

    Note: This function will always overwrite the path variable in the object so the instance
    is aware of where it is relative to the config directory.

    :param config_path: The path to the projit configuration
    :type config_path: string, required

    :raises FileNotFoundError: If the config file is missing from config_path.

    :return: Projit Object
    :rtype: Projit
    """
    path_to_json = config_path + "/" + config_file
    if not os.path.exists(path_to_json):
        # Without the config there is no name to build the project from;
        # fail with a message that says so.
        raise FileNotFoundError("Projit Config Exception: No config file found at: '%s'" % path_to_json)
    with open(path_to_json) as f:
        fields = json.load(f)

    # Executions and tags live in their own files; both are optional.
    path_to_execs = config_path + "/" + execution_file
    if os.path.exists(path_to_execs):
        with open(path_to_execs) as f:
            fields['executions'] = json.load(f)

    path_to_tags = config_path + "/" + tag_file
    if os.path.exists(path_to_tags):
        with open(path_to_tags) as f:
            fields['tags'] = json.load(f)

    # The stored path is replaced by the location we actually loaded from
    fields['path'] = config_path
    return Projit(**fields)



##########################################################################################

[docs]
def projit_load():
    """
    Locate the projit config directory with ``locate_projit_config`` and
    load the project stored there.

    :return: Projit Object
    :rtype: Projit
    """
    config_path = locate_projit_config()
    return load(config_path)



##########################################################################################

[docs]
def init(template, name, desc=""):
    """
    Initialise a new projit project.
    Create the config directory, write the project config there and then
    apply the requested template via ``init_template``.

    :param template: The template to initialise the project from
    :type template: string, required

    :param name: The name of the project
    :type name: string, required

    :param desc: The project description
    :type desc: string, optional

    :return: Projit Object
    :rtype: Projit
    """
    os.mkdir(config_folder)
    new_project = Projit(path=config_folder, name=name, desc=desc)
    new_project.save()
    init_template(template)
    return new_project


##########################################################################################


[docs]
def init_template(template):
    """
    Initialise a project from a specified template.

    When ``template`` is not the empty string, the template is loaded with
    ``load_template`` and each entry under its ``'dirs'`` key that is not
    already a directory is created with ``os.mkdir``.

    :param template: The template identifier passed to ``load_template``;
                     an empty string skips template initialisation
    :type template: string, required
    """
    if template != "":
        temp = load_template(template)
        for d in temp['dirs']:
            # Leave directories that already exist untouched
            if not os.path.isdir(d):
                os.mkdir(d)