Source code for spacekit.skopes.hst.svm.predict

"""
This module generates predictions using a pre-trained ensemble neural network for unlabeled SVM regression test data and
alignment images. The ensemble model is a combination of two neural networks: a MultiLayerPerceptron (for regression test data)
and a 3D Image Convolutional Neural Network (CNN). The script includes functions for the following steps:

1. load and prep the data and images for ML
2. load the saved model and generate predictions
3. save predictions and summary statistics to disk

This script (and/or its functions) should be used in conjunction with spacekit.skopes.hst.svm.prep if using raw data (since both
the regression test dataframe for MLP and the png images for the CNN need to be created first). Once a model has been trained
using the spacekit.skopes.hst.svm.train script, it is saved to disk and can be loaded for use here to generate predictions on
unlabeled data.
"""

# from zipfile import ZipFile
# import tensorflow as tf
import numpy as np
import pandas as pd
import argparse
import os
import sys
import datetime as dt
from spacekit.extractor.load import load_datasets, SVMImageIO
from spacekit.preprocessor.transform import PowerX
from spacekit.builder.architect import BuilderEnsemble

# from spacekit.builder.blueprints import Blueprint

DIM = 3
CH = 3
SIZE = 128
DEPTH = DIM * CH
SHAPE = (DIM, SIZE, SIZE, CH)

TF_CPP_MIN_LOG_LEVEL = 2

DETECTOR_KEY = {"hrc": 0, "ir": 1, "sbc": 2, "uvis": 3, "wfc": 4}



[docs]
def load_mixed_inputs(data_file, img_path, tx=None, size=128, norm=0):
    """Load the regression test data and image input data, then stacks the arrays into a single combined input (list) for the
    ensemble model.

    Parameters
    ----------
    data_file : str
        path to preprocessed mosaic data csv file
    img_path : str
        path to png images parent directory
    size : int, optional
        image size (width and height), by default 128

    Returns
    -------
    list
        regression test data (MLP inputs) and image data (CNN inputs) joined as a list
    """
    cols = [
        "numexp",
        "rms_ra",
        "rms_dec",
        "nmatches",
        "point",
        "segment",
        "gaia",
        "det",
        "wcs",
        "cat",
    ]
    X_data = load_datasets([data_file], column_order=cols)

    print("Loading images into arrays...")
    idx, X_img = SVMImageIO(img_path, w=size, h=size, d=9, data=X_data).load()
    if norm:
        if tx is None:
            Tx = PowerX(X_data, cols=cols[:-3], rename=None)
        else:
            Tx = PowerX(X_data, cols=cols[:-3], tx_file=tx, rename=None)
        X_data = Tx.Xt
        X_img /= 255.0

    diff = X_data.shape[0] - X_img.shape[0]
    if diff > 0:
        X_data = X_data.loc[X_data.index.isin(idx)]
        print(f"{diff} missing images removed from index")
        print(f"X_data: {X_data.shape}\nX_img:  {X_img.shape}")
    print("Joining regression data and image arrays")
    return [X_data, X_img]




[docs]
def classification_report(df, output_path, group=None):
    """Generates a scikit learn classification report with model evaluation metrics and saves to disk.

    Parameters
    ----------
    df : Pandas dataframe
        Feature inputs for which the model will generate predictions.
    output_path : str
        location to store prediction output files
    group: str, optional
        Name for this group of data (for classification report), e.g. SVM-2021-11-02
    """
    P, T = df["y_pred"], df["det"].value_counts()
    C = df.loc[P == 1.0]
    cmp = C["det"].value_counts()
    separator = "---" * 7
    date, time = dt.datetime.now().isoformat().split(".")[0].split("T")
    if group is None:
        group = "SVM-DATA"
    out = sys.stdout
    with open(f"{output_path}/clf_report.txt", "w") as f:
        sys.stdout = f
        print("CLASSIFICATION REPORT")
        print("date: ", date)
        print("time: ", time)
        print("data: ", group)
        print(separator)
        print("Mean Probability Score: ", np.round(df["y_proba"].mean(), 4))
        print("Standard Deviation: ", np.round(df["y_proba"].std(), 4))
        print(separator)
        print("Alignment Evaluation")
        print("0.0=aligned, 1.0=suspicious")
        cnt_pct = pd.concat(
            [
                P.value_counts(),
                P.value_counts(normalize=True),
            ],
            axis=1,
            keys=["cnt", "pct"],
        )
        print(cnt_pct)
        print(separator)
        print("Misalignment counts by Detector")
        for d, i in DETECTOR_KEY.items():
            if i in cmp:
                # some alignments from this detector were suspicious
                print(f"{d}\t{cmp[i]} \t ({T[i]}) \t {np.round((cmp[i] / T[i]) * 100, 1)}%")
            elif i in T:
                # no alignments from this detector were suspicious
                print(f"{d}\t0 \t ({T[i]}) \t 0%")
            else:
                # no samples from this detector in dataset
                print(f"{d}\t0 \t (0) \t -")
        sys.stdout = out
    print(f"\nClassification Report created: {output_path}/clf_report.txt")
    with open(f"{output_path}/compromised.txt", "w") as file:
        for line in list(C["y_pred"].index):
            file.writelines(f"{line}\n")
    print(f"\nSuspicious/Compromised List created: {output_path}/compromised.txt")




[docs]
def classify_alignments(X, model, output_path=None, group=None):
    """Returns classifier predictions and probability scores

    Parameters
    ----------
    X : numpy array
        input features
    model_path : str, optional
        saved model directory path, by default None
    output_path : str
        location to store prediction output files
    group: str, optional
        Name for this group of data (for classification report), e.g. SVM-2021-11-02

    Returns
    -------
    Pandas dataframe
        prediction values, probability scores for target, merged with original input features
    """
    if output_path is None:
        output_path = os.getcwd()
    output_path = os.path.join(output_path, "predictions")
    os.makedirs(output_path, exist_ok=True)
    y_proba = model.predict(X)
    y_pred = np.round(y_proba[:, 0]).reshape(-1, 1)
    # y_proba = proba[:, 0].reshape(-1, 1)
    preds = np.concatenate([y_pred, y_proba], axis=1)
    pred_proba = pd.DataFrame(preds, index=X[0].index, columns=["y_pred", "y_proba"])
    preds = X[0].join(pred_proba)
    preds["index"] = preds.index
    output_file = f"{output_path}/predictions.csv"
    preds.to_csv(output_file, index=False)
    print("Y_PRED + Probabilities added. Dataframe saved to: ", output_file)
    classification_report(preds, output_path, group=group)
    return preds




[docs]
def predict_alignment(
    data_file,
    img_path,
    model_path=None,
    output_path=None,
    size=128,
    norm=0,
    group=None,
    extract_to="models",
):
    """Main calling function to load the data and model, generate predictions, and save results to disk.

    Parameters
    ----------
    data_file : str
        path to preprocessed mosaic data csv file
    img_path : str
        path to png images parent directory
    model_path : str, optional
        saved model directory path, by default None
    output_path : str, optional
        location to store prediction output files, by default None
    size : int, optional
        image size (width and height), by default None (128)
    group: str, optional
        Name for this group of data (for classification report), e.g. SVM-2021-11-02
    """
    builder = BuilderEnsemble(model_path=model_path)
    builder.load_saved_model(arch="svm_align", extract_to=extract_to, keras_archive=True)
    builder.find_tx_file()
    X = load_mixed_inputs(data_file, img_path, tx=builder.tx_file, size=size, norm=norm)
    preds = classify_alignments(X, builder.model, output_path=output_path, group=group)
    return preds



if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="spacekit",
        usage="spacekit.skopes.hst.svm.predict svm_data.csv img",
    )
    parser.add_argument(
        "data_file",
        type=str,
        default="svm_data.csv",
        help="path to preprocessed mosaic data csv file",
    )
    parser.add_argument("img_path", type=str, help="path to png images parent directory")
    parser.add_argument(
        "-m",
        "--model_path",
        type=str,
        default=None,
        help="saved model path",
    )
    parser.add_argument(
        "-o",
        "--output_path",
        type=str,
        default=None,
        help="location to store prediction output files",
    )
    parser.add_argument(
        "-s",
        "--size",
        type=int,
        default=128,
        help="image size (width and height). Default is 128.",
    )
    parser.add_argument(
        "-n",
        "--normalization",
        type=int,
        default=0,
        help="apply normalization and scaling",
    )
    parser.add_argument(
        "-g",
        "--group",
        type=str,
        default=None,
        help="Name for this group of data (to be included in classification report)",
    )
    args = parser.parse_args()
    _ = predict_alignment(
        args.data_file,
        args.img_path,
        model_path=args.model_path,
        output_path=args.output_path,
        size=args.size,
        norm=args.normalization,
        group=args.group,
    )