Source code for lux.action.correlation

#  Copyright 2019-2020 The Lux Authors.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import lux
from lux.interestingness.interestingness import interestingness
from lux.processor.Compiler import Compiler
from lux.core.frame import LuxDataFrame
from lux.vis.VisList import VisList
from lux.utils import utils


# change ignore_transpose to false for now.
[docs]def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True):
    """
    Generates bivariate visualizations that represent all pairwise relationships in the data.

    Parameters
    ----------
    ldf : LuxDataFrame
            LuxDataFrame with underspecified intent.

    ignore_transpose: bool
            Boolean flag to ignore pairs of attributes whose transpose are already computed (i.e., {X,Y} will be ignored if {Y,X} is already computed)

    Returns
    -------
    recommendations : Dict[str,obj]
            object with a collection of visualizations that result from the Correlation action.
    """

    import numpy as np

    filter_specs = utils.get_filter_specs(ldf._intent)
    intent = [
        lux.Clause("?", data_model="measure"),
        lux.Clause("?", data_model="measure"),
    ]
    intent.extend(filter_specs)
    vlist = VisList(intent, ldf)
    examples = ""
    if len(vlist) > 1:
        measures = vlist[0].get_attr_by_data_model("measure")
        if len(measures) >= 2:
            examples = f" (e.g., {measures[0].attribute}, {measures[1].attribute})"
    recommendation = {
        "action": "Correlation",
        "description": "Show relationships between two <p class='highlight-descriptor'>quantitative</p> attributes.",
        "long_description": f"Correlation searches through all pairwise relationship between two quantitative attributes\
            {examples}. The visualizations are ranked from most to least linearly correlated based on \
                their Pearson’s correlation score.",
    }
    ignore_rec_flag = False
    # Doesn't make sense to compute correlation if less than 4 data values
    if len(ldf) < 5:
        ignore_rec_flag = True
    # Then use the data populated in the vis list to compute score
    for vis in vlist:
        measures = vis.get_attr_by_data_model("measure")
        if len(measures) < 2:
            raise ValueError(
                f"Can not compute correlation between {[x.attribute for x in ldf.columns]} since less than 2 measure values present."
            )
        msr1 = measures[0].attribute
        msr2 = measures[1].attribute
        if ignore_transpose:
            check_transpose = check_transpose_not_computed(vlist, msr1, msr2)
        else:
            check_transpose = True
        if check_transpose:
            vis.score = interestingness(vis, ldf)
        else:
            vis.score = -1
    if ignore_rec_flag:
        recommendation["collection"] = []
        return recommendation
    vlist.sort()
    vlist = vlist.showK()
    recommendation["collection"] = vlist
    return recommendation


[docs]def check_transpose_not_computed(vlist: VisList, a: str, b: str):
    transpose_exist = list(
        filter(
            lambda x: (x._inferred_intent[0].attribute == b) and (x._inferred_intent[1].attribute == a),
            vlist,
        )
    )
    if len(transpose_exist) > 0:
        return transpose_exist[0].score == -1
    else:
        return False