Source code for lux.action.similarity

#  Copyright 2019-2020 The Lux Authors.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.

import lux
import pandas as pd
import math
import numpy as np
from lux.vis.VisList import VisList


[docs]def similar_pattern(ldf, intent, topK=-1): """ Generates visualizations with similar patterns to a query visualization. Parameters ---------- ldf : lux.core.frame LuxDataFrame with underspecified intent. intent: list[lux.Clause] intent for specifying the visual query for the similarity search. topK: int number of visual recommendations to return. Returns ------- recommendations : Dict[str,obj] object with a collection of visualizations that result from the Similarity action """ row_specs = list(filter(lambda x: x.value != "", intent)) if len(row_specs) == 1: search_space_vc = VisList(ldf.current_vis.collection.copy(), ldf) query_vc = VisList(intent, ldf) query_vis = query_vc[0] preprocess(query_vis) # for loop to create assign euclidean distance recommendation = { "action": "Similarity", "description": "Show other charts that are visually similar to the Current vis.", } for vis in search_space_vc: preprocess(vis) vis.score = euclidean_dist(query_vis, vis) search_space_vc.normalize_score(invert_order=True) if topK != -1: search_space_vc = search_space_vc.topK(topK) recommendation["collection"] = search_space_vc return recommendation else: print("Query needs to have 1 row value")
[docs]def aggregate(vis): """ Aggregates data values on the y axis so that the vis is a time series Parameters ---------- vis : lux.vis.Vis vis that represents the candidate visualization Returns ------- None """ if vis.get_attr_by_channel("x") and vis.get_attr_by_channel("y"): xAxis = vis.get_attr_by_channel("x")[0].attribute yAxis = vis.get_attr_by_channel("y")[0].attribute vis.data = vis.data[[xAxis, yAxis]].groupby(xAxis, as_index=False).agg({yAxis: "mean"}).copy()
[docs]def interpolate(vis, length): """ Interpolates the vis data so that the number of data points is fixed to a constant Parameters ---------- vis : lux.vis.Vis vis that represents the candidate visualization length : int number of points a vis should have Returns ------- None """ if vis.get_attr_by_channel("x") and vis.get_attr_by_channel("y"): xAxis = vis.get_attr_by_channel("x")[0].attribute yAxis = vis.get_attr_by_channel("y")[0].attribute if xAxis and yAxis: yVals = vis.data[yAxis] xVals = vis.data[xAxis] n = length interpolated_x_vals = [0.0] * (length) interpolated_y_vals = [0.0] * (length) granularity = (xVals[len(xVals) - 1] - xVals[0]) / n count = 0 for i in range(0, n): interpolated_x = xVals[0] + i * granularity interpolated_x_vals[i] = interpolated_x while xVals[count] < interpolated_x: if count < len(xVals): count += 1 if xVals[count] == interpolated_x: interpolated_y_vals[i] = yVals[count] else: x_diff = xVals[count] - xVals[count - 1] yDiff = yVals[count] - yVals[count - 1] interpolated_y_vals[i] = ( yVals[count - 1] + (interpolated_x - xVals[count - 1]) / x_diff * yDiff ) vis.data = pd.DataFrame( list(zip(interpolated_x_vals, interpolated_y_vals)), columns=[xAxis, yAxis], )
# interpolate dataset
[docs]def normalize(vis): """ Normalizes the vis data so that the range of values is 0 to 1 for the vis Parameters ---------- vis : lux.vis.Vis vis that represents the candidate visualization Returns ------- None """ if vis.get_attr_by_channel("y"): y_axis = vis.get_attr_by_channel("y")[0].attribute max = vis.data[y_axis].max() min = vis.data[y_axis].min() if max == min or (max - min < 1): return vis.data[y_axis] = (vis.data[y_axis] - min) / (max - min)
[docs]def euclidean_dist(query_vis, vis): """ Calculates euclidean distance score for similarity between two visualizations Parameters ---------- query_vis : lux.vis.Vis vis that represents the query pattern vis : lux.vis.Vis vis that represents the candidate visualization Returns ------- score : float euclidean distance score """ if query_vis.get_attr_by_channel("y") and vis.get_attr_by_channel("y"): vis_y_axis = vis.get_attr_by_channel("y")[0].attribute query_y_axis = query_vis.get_attr_by_channel("y")[0].attribute vis_vector = vis.data[vis_y_axis].values query_vector = query_vis.data[query_y_axis].values score = np.linalg.norm(vis_vector - query_vector) return score else: print("no y axis detected") return 0
[docs]def preprocess(vis): """ Processes vis data to allow similarity comparisons between visualizations Parameters ---------- vis : lux.vis.Vis vis that represents the candidate visualization Returns ------- None """ aggregate(vis) interpolate(vis, 100) normalize(vis)