Source code for spacekit.skopes.hst.svm.train

"""
This module builds, trains, and evaluates an ensemble model for labeled and preprocessed SVM regression test data and alignment
images. The ensemble model is a combination of two neural networks: a MultiLayerPerceptron (for regression test data) and a 3D
Image Convolutional Neural Network (CNN). The script includes functions for the following steps:

1. load and prep the data and images for ML
2. build and train the model
3. compute results and save to disk

This script (and/or its functions) should be used in conjunction with spacekit.skopes.hst.svm.prep if using raw data (since both
the regression test dataframe for MLP and the png images for the CNN need to be created first). Once a model has been trained
using this script, it is saved to disk and can be loaded again later for use with the predict script
(spacekit.skopes.hst.svm.predict).
"""

import os
import argparse
import datetime as dt
from spacekit.extractor.load import load_datasets, SVMImageIO
from spacekit.generator.augment import training_data_aug, training_img_aug
from spacekit.preprocessor.transform import (
    normalize_training_data,
    normalize_training_images,
)
from spacekit.builder.architect import BuilderEnsemble
from spacekit.analyzer.compute import ComputeBinary

# Module-level defaults describing the image input geometry. The values match the
# defaults of load_ensemble_data (dim=3, ch=3, img_size=128); none of the functions
# visible in this module read these names directly.
DIM = CH = 3
WIDTH = HEIGHT = 128
DEPTH = DIM * CH
SHAPE = (DIM, WIDTH, HEIGHT, CH)
# NOTE(review): this binds a plain Python name only; nothing visible here exports it
# to os.environ, so it presumably has no effect on TensorFlow logging -- confirm intent.
TF_CPP_MIN_LOG_LEVEL = 2



[docs]
def make_ensembles(
    train_data,
    train_img,
    train_label,
    test_data,
    test_img,
    test_label,
    val_data=None,
    val_img=None,
    val_label=None,
):
    """Pair regression test (MLP) arrays with image (CNN) arrays as mixed inputs for an ensemble model.

    Each feature output is a two-item list ``[data, img]`` and each target output is the
    corresponding label array reshaped into a single column.

    Parameters
    ----------
    train_data : numpy array
        training set feature data inputs
    train_img : numpy array
        training set image inputs
    train_label : numpy array
        training set target values
    test_data : numpy array
        test set feature data inputs
    test_img : numpy array
        test set image inputs
    test_label : numpy array
        test set target values
    val_data : numpy array, optional
        validation set feature data inputs, by default None
    val_img : numpy array, optional
        validation set image inputs, by default None
    val_label : numpy array, optional
        validation set target values, by default None

    Returns
    -------
    tuple
        (XTR, YTR, XTS, YTS, XVL, YVL) when ``val_data`` is given, otherwise only
        (XTR, YTR, XTS, YTS). X* items are ``[data, img]`` lists; Y* items are
        column-shaped target arrays.
    """
    print("Stacking mixed inputs (DATA + IMG)")
    stacked = (
        [train_data, train_img],
        train_label.reshape(-1, 1),
        [test_data, test_img],
        test_label.reshape(-1, 1),
    )
    # The presence of val_data alone decides whether a validation pair is appended.
    if val_data is None:
        return stacked
    return stacked + ([val_data, val_img], val_label.reshape(-1, 1))




[docs]
def load_ensemble_data(filename, img_path, img_size=128, dim=3, ch=3, norm=0, v=0.85, output_path=None):
    """Load regression test data (csv) and image data (png), augment the training inputs, optionally normalize,
    and stack both data types into mixed inputs for an ensemble model.

    The train / test / validation partitions are taken as returned by ``SVMImageIO.load()``. Alongside the model
    inputs, a master index list is returned so results can be traced back to the original dataset input names.

    Parameters
    ----------
    filename : str
        path to preprocessed dataframe csv file
    img_path : str
        path to png images parent directory
    img_size : int, optional
        image size (single value assigned to width and height), by default 128
    dim : int, optional
        dimensions (or volume) of image frames per image (for 3D CNN), by default 3
    ch : int, optional
        channels (rgb is 3, grayscale is 1), by default 3
    norm : bool or int, optional
        apply the normalization step when truthy, by default 0
    v : float, optional
        validation set ratio for evaluating model (forwarded to SVMImageIO), by default 0.85
    output_path : str, optional
        forwarded to the regression data normalization step for saving its outputs, by default None

    Returns
    -------
    list, ndarrays
        tv_idx, XTR, YTR, XTS, YTS, XVL, YVL
        tv_idx is ``[y[1], y[2], img_idx]`` (test targets/index, validation targets/index, augmented image index);
        the remaining items are the stacked feature (X) and target (Y) inputs produced by make_ensembles.
    """
    # Regression test dataframe (MLP inputs)
    print("[i] Importing Regression Test Data")
    df = load_datasets([filename])
    print("\tREG DATA: ", df.shape)
    print(f"\nClass Labels (0=Aligned, 1=Misaligned)\n{df['label'].value_counts()}")

    # Image arrays (CNN inputs); frame depth is dim * ch
    image_io = SVMImageIO(img_path, w=img_size, h=img_size, d=dim * ch, inference=False, data=df, v=v)
    (X, y), (train, test, val) = image_io.load()

    # Augment the regression training features. The augmented targets are discarded here;
    # the training targets used for the ensemble come from the image augmentation step.
    print("\nPerforming Regression Data Augmentation")
    mlp_train, _ = training_data_aug(X[0], y[0])

    # Augment the image sets
    print("\nPerforming Image Data Augmentation")
    img_idx, (img_train, lab_train), (img_test, lab_test), (img_val, lab_val) = training_img_aug(
        train, test, val=val
    )

    # Test and validation regression features are used as loaded unless normalization is requested
    mlp_test, mlp_val = X[1], X[2]
    if norm:
        cols = ["numexp", "rms_ra", "rms_dec", "nmatches", "point", "segment", "gaia"]
        mlp_train, mlp_test, mlp_val = normalize_training_data(
            df, cols, mlp_train, mlp_test, X_val=mlp_val, output_path=output_path, rename=None
        )
        img_train, img_test, img_val = normalize_training_images(img_train, img_test, X_vl=img_val)

    # Join inputs: MLP + CNN
    XTR, YTR, XTS, YTS, XVL, YVL = make_ensembles(
        mlp_train,
        img_train,
        lab_train,
        mlp_test,
        img_test,
        lab_test,
        val_data=mlp_val,
        val_img=img_val,
        val_label=lab_val,
    )
    tv_idx = [y[1], y[2], img_idx]
    return tv_idx, XTR, YTR, XTS, YTS, XVL, YVL




[docs]
def train_ensemble(
    XTR,
    YTR,
    XTS,
    YTS,
    model_name="ensembleSVM",
    params=None,
    output_path=None,
    keras=True,
):
    """Build, fit and save an ensemble model from mixed regression test data and image inputs.

    Parameters
    ----------
    XTR : tuple/list
        training set feature (X) pair of regression data and image data numpy arrays.
    YTR : numpy array
        training set target values
    XTS : tuple/list
        test set feature (X) pair of regression data and image data numpy arrays.
    YTS : numpy array
        test set target values
    model_name : str, optional
        name of model, by default "ensembleSVM"
    params : dict, optional
        custom parameters for model fitting; when None a default set is used
        (batch_size=32, epochs=60, lr=1e-4, decay=[100000, 0.96], no early stopping), by default None
    output_path : str, optional
        custom path for saving the model, by default None (current working directory)
    keras : bool, optional
        forwarded to ``save_model`` as ``keras_archive``, by default True

    Returns
    -------
    spacekit.builder.architect.BuilderEnsemble
        the ensemble builder object after building, fitting and saving the model
    """
    if params is None:
        # Default fit hyperparameters
        params = {
            "batch_size": 32,
            "epochs": 60,
            "lr": 1e-4,
            "decay": [100000, 0.96],
            "early_stopping": None,
            "verbose": 1,
            "ensemble": True,
        }
    builder = BuilderEnsemble(
        X_train=XTR,
        y_train=YTR,
        X_test=XTS,
        y_test=YTS,
        params=params,
        input_name="svm_mixed_inputs",
        output_name="svm_output",
        name=model_name,
    )
    builder.build()
    builder.batch_fit()
    # Resolve the save location only after training has finished
    save_dir = os.getcwd() if output_path is None else output_path
    builder.save_model(weights=True, output_path=save_dir, parent_dir="svm_align", keras_archive=keras)
    return builder




[docs]
def compute_results(ens, tv_idx, val_set=(), output_path=None):
    """Evaluate a trained ensemble on its test set (and optionally a validation set) and write the results to disk.

    Results are written beneath ``<output_path>/results`` in ``test`` and ``val`` subfolders. The validation
    pass only runs when ``val_set`` holds exactly two items and its first feature array has more than two rows.

    Parameters
    ----------
    ens : builder.networks.Ensemble
        ensemble model builder object; its ``test_idx`` attribute is set here, and ``X_val`` / ``y_val``
        are set as well when a validation pass is run
    tv_idx : tuple or list of Pandas Series
        test and validation indices (used for FNFP analysis); item 0 is used for test, item 1 for validation
    val_set : tuple or list of arrays, optional
        validation set (X_val, y_val) of features and target arrays, by default ()
    output_path : str, optional
        custom path for saving results, by default None (current working directory)

    Returns
    -------
    tuple
        (com, val): the test set ComputeBinary object and the validation ComputeBinary object.
        ``val`` is None when the validation pass is skipped.
    """
    base_dir = os.getcwd() if output_path is None else output_path
    res_path = os.path.join(base_dir, "results")

    # Test set evaluation
    ens.test_idx = tv_idx[0]
    com = ComputeBinary(builder=ens, res_path=f"{res_path}/test")
    com.calculate_results()
    com.make_outputs()

    # Validation set evaluation
    run_validation = len(val_set) == 2 and val_set[0][0].shape[0] > 2  # temp (ignores test data)
    if not run_validation:
        return com, None

    # Look up the validation index before mutating the builder
    val_idx = tv_idx[1]
    ens.X_val, ens.y_val = val_set
    ens.test_idx = val_idx
    val = ComputeBinary(builder=ens, res_path=f"{res_path}/val", validation=True)
    val.calculate_results()
    val.make_outputs()
    return com, val




[docs]
def run_training(
    data_file,
    img_path,
    img_size=128,
    norm=0,
    v=0.85,
    model_name="ensembleSVM",
    params=None,
    output_path=None,
    keras=True,
):
    """Run the full training workflow: load and prep the data, train the model, compute results and save to disk.

    Parameters
    ----------
    data_file : str (path)
        path to preprocessed dataframe csv file
    img_path : str (path)
        path to png images parent directory
    img_size : int, optional
        image size (single value assigned to width and height), by default 128
    norm : int, optional
        apply normalization step (1=True, 0=False), by default 0
    v : float, optional
        validation set ratio for evaluating model, by default 0.85
    model_name : str, optional
        custom name to assign to model, by default "ensembleSVM"
    params : dict, optional
        custom training hyperparameters dictionary, by default None
    output_path : str (path), optional
        custom path for saving model, results, by default None (current working directory)
    keras : bool, optional
        passed through to train_ensemble, by default True

    Returns
    -------
    tuple
        (ens, com, val): the ensemble builder object, the test set compute object and the
        validation compute object, as returned by train_ensemble and compute_results
    """
    # 1. load and prep
    loaded = load_ensemble_data(
        data_file,
        img_path,
        img_size=img_size,
        norm=norm,
        v=v,
        output_path=output_path,
    )
    tv_idx, XTR, YTR, XTS, YTS, XVL, YVL = loaded

    # 2. build, train and save
    fit_options = dict(model_name=model_name, params=params, output_path=output_path, keras=keras)
    ens = train_ensemble(XTR, YTR, XTS, YTS, **fit_options)

    # 3. evaluate on the test and validation sets
    results = compute_results(ens, tv_idx, val_set=(XVL, YVL), output_path=output_path)
    com, val = results
    return ens, com, val



def parse_args(argv=None):
    """Parse the command-line options of the SVM ensemble training script.

    Parameters
    ----------
    argv : list of str, optional
        argument strings to parse, by default None (argparse then reads sys.argv[1:])

    Returns
    -------
    argparse.Namespace
        parsed options: data_file, img_path, image_size, model_name, output_path, normalize,
        batchsize, epochs, early_stopping, validate, plots, verbose
    """
    parser = argparse.ArgumentParser(
        prog="spacekit",
        usage="python -m spacekit.skopes.hst.svm.train svm_train.csv path/to/img",
    )
    parser.add_argument("data_file", type=str, help="path to training data csv file(s)")
    parser.add_argument("img_path", type=str, help="path to png images parent directory")
    parser.add_argument(
        "-s",
        "--image_size",
        type=int,
        default=128,
        help="image pixel size (single value assigned to width and height)",
    )
    parser.add_argument("-m", "--model_name", type=str, default="ensembleSVM", help="name to give model")
    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        default=None,
        help="custom path for saving model, results, by default None (current working directory)",
    )
    # Parsed as int so that "-n 0" is falsy; with type=str the value "0" was a
    # non-empty (truthy) string and normalization was applied regardless.
    parser.add_argument(
        "-n",
        "--normalize",
        type=int,
        default=0,
        help="apply normalization and scaling to regression test data",
    )
    parser.add_argument("-b", "--batchsize", type=int, default=32, help="batch_size")
    parser.add_argument("-e", "--epochs", type=int, default=60, help="number of epochs")
    parser.add_argument(
        "-y",
        "--early_stopping",
        type=str,
        default=None,
        choices=["val_accuracy", "val_loss"],
        help="early stopping",
    )
    parser.add_argument(
        "-v",
        "--validate",
        type=int,
        default=1,
        help="evaluate model with validation sample",
    )
    parser.add_argument("-p", "--plots", type=int, default=0, help="draw model performance plots")
    # The fit params read args.verbose, which was never defined as an option.
    # "-v" is already taken by --validate, so only the long form is offered.
    parser.add_argument(
        "--verbose",
        type=int,
        default=1,
        help="verbosity level passed to model fitting",
    )
    return parser.parse_args(argv)


def main(argv=None):
    """Command-line entry point: parse options, run training, and optionally draw performance plots.

    Parameters
    ----------
    argv : list of str, optional
        argument strings to parse, by default None (argparse then reads sys.argv[1:])

    Returns
    -------
    tuple
        (ens, com, val) as returned by run_training
    """
    args = parse_args(argv)
    v = 0.85 if args.validate == 1 else 0
    if args.output_path is None:
        # Default to a timestamped folder under the current working directory
        timestamp = str(int(dt.datetime.now().timestamp()))
        output_path = os.path.join(os.getcwd(), f"mml_{timestamp}")
    else:
        output_path = args.output_path
    # SET MODEL FIT PARAMS
    params = dict(
        batch_size=args.batchsize,
        epochs=args.epochs,
        lr=1e-4,
        decay=[100000, 0.96],
        early_stopping=args.early_stopping,
        verbose=args.verbose,
        ensemble=True,
    )
    ens, com, val = run_training(
        args.data_file,
        args.img_path,
        img_size=args.image_size,
        norm=args.normalize,
        v=v,
        model_name=args.model_name,
        params=params,
        output_path=output_path,
    )
    # args.plots is an int, so test truthiness; an identity check against True never passes.
    if args.plots:
        com.draw_plots()
        # compute_results returns None for the validation object when validation is skipped
        if val is not None:
            val.draw_plots()
    return ens, com, val


if __name__ == "__main__":
    main()