# Copyright 2019-2020 The Lux Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import lux
import pandas as pd
import math
import numpy as np
from lux.vis.VisList import VisList
def similar_pattern(ldf, intent, topK=-1):
    """
    Scores the visualizations of the current vis collection against a query
    visualization and returns them as a "Similarity" recommendation.

    Parameters
    ----------
    ldf : lux.core.frame
        LuxDataFrame with underspecified intent. A copy of its
        ``current_vis.collection`` is used as the set of candidates.
    intent : list[lux.Clause]
        intent for specifying the visual query for the similarity search.
        Exactly one clause must have a non-empty ``value``.
    topK : int
        number of visual recommendations to return; ``-1`` skips the
        ``topK`` truncation.

    Returns
    -------
    recommendations : Dict[str,obj]
        dictionary with the keys "action", "description" and "collection".
        When the intent does not contain exactly one clause with a value, a
        message is printed and ``None`` is returned instead.
    """
    valued_clauses = [clause for clause in intent if clause.value != ""]
    if len(valued_clauses) != 1:
        print("Query needs to have 1 row value")
        return None

    candidates = VisList(ldf.current_vis.collection.copy(), ldf)
    query_vis = VisList(intent, ldf)[0]
    preprocess(query_vis)

    # Each candidate is preprocessed the same way as the query and scored by
    # its euclidean distance to it.
    for candidate in candidates:
        preprocess(candidate)
        candidate.score = euclidean_dist(query_vis, candidate)
    candidates.normalize_score(invert_order=True)

    if topK != -1:
        candidates = candidates.topK(topK)

    return {
        "action": "Similarity",
        "description": "Show other charts that are visually similar to the Current vis.",
        "collection": candidates,
    }
def aggregate(vis):
    """
    Collapses the vis data to one row per x value, keeping the mean of the
    y values that share that x, so that the vis is a time series.

    Only the x and y columns are kept. The vis is left untouched when it has
    no attribute on the x channel or none on the y channel.

    Parameters
    ----------
    vis : lux.vis.Vis
        vis that represents the candidate visualization; ``vis.data`` is
        replaced in place.

    Returns
    -------
    None
    """
    x_clauses = vis.get_attr_by_channel("x")
    if not x_clauses:
        return
    y_clauses = vis.get_attr_by_channel("y")
    if not y_clauses:
        return

    x_name = x_clauses[0].attribute
    y_name = y_clauses[0].attribute
    per_x = vis.data[[x_name, y_name]].groupby(x_name, as_index=False)
    vis.data = per_x.agg({y_name: "mean"}).copy()
def interpolate(vis, length):
    """
    Interpolates the vis data so that the number of data points is fixed to a constant.

    The x range ``[x_first, x_last)`` is divided into ``length`` equal steps
    and the y value at each step is obtained by linear interpolation between
    the two neighbouring data points. ``vis.data`` is replaced by a new
    two-column DataFrame (x, y) with ``length`` rows.

    The vis is left untouched when it has no x or y attribute, or when its
    data is empty.

    Parameters
    ----------
    vis : lux.vis.Vis
        vis that represents the candidate visualization. The x column is
        assumed to be sorted in ascending order (``aggregate`` produces that
        ordering through ``groupby``).
    length : int
        number of points a vis should have

    Returns
    -------
    None
    """
    x_clauses = vis.get_attr_by_channel("x")
    y_clauses = vis.get_attr_by_channel("y")
    if not (x_clauses and y_clauses):
        return
    x_axis = x_clauses[0].attribute
    y_axis = y_clauses[0].attribute
    if not (x_axis and y_axis):
        return

    # Work on positional arrays: indexing the Series with integers is
    # label-based and fails whenever the frame's index is not 0..n-1.
    x_vals = vis.data[x_axis].to_numpy()
    y_vals = vis.data[y_axis].to_numpy()
    if len(x_vals) == 0:
        # Nothing to interpolate from.
        return

    last = len(x_vals) - 1
    granularity = (x_vals[last] - x_vals[0]) / length
    interpolated_x_vals = [0.0] * length
    interpolated_y_vals = [0.0] * length

    pos = 0
    for i in range(length):
        target_x = x_vals[0] + i * granularity
        interpolated_x_vals[i] = target_x
        # Advance to the first data point at or beyond target_x; the bound is
        # checked before reading so the cursor can never run past the data.
        while pos < last and x_vals[pos] < target_x:
            pos += 1
        if x_vals[pos] == target_x:
            interpolated_y_vals[i] = y_vals[pos]
        else:
            # Linear interpolation on the segment ending at `pos`.
            x_diff = x_vals[pos] - x_vals[pos - 1]
            y_diff = y_vals[pos] - y_vals[pos - 1]
            interpolated_y_vals[i] = (
                y_vals[pos - 1] + (target_x - x_vals[pos - 1]) / x_diff * y_diff
            )

    vis.data = pd.DataFrame(
        list(zip(interpolated_x_vals, interpolated_y_vals)),
        columns=[x_axis, y_axis],
    )
def normalize(vis):
    """
    Normalizes the vis data so that the range of values is 0 to 1 for the vis.

    The y column of ``vis.data`` is rescaled in place with min-max scaling.
    Nothing happens when the vis has no y attribute, when all y values are
    equal, or when the spread of the y values is smaller than 1.

    Parameters
    ----------
    vis : lux.vis.Vis
        vis that represents the candidate visualization

    Returns
    -------
    None
    """
    y_clauses = vis.get_attr_by_channel("y")
    if not y_clauses:
        return
    y_axis = y_clauses[0].attribute

    # Local names avoid shadowing the `max` / `min` builtins.
    y_max = vis.data[y_axis].max()
    y_min = vis.data[y_axis].min()
    if y_max == y_min:
        # Constant series: scaling would divide by zero.
        return
    y_range = y_max - y_min
    # NOTE(review): a spread below 1 is also left unscaled, so such a series
    # does not end up spanning 0..1 - existing behavior, confirm it is intended.
    if y_range < 1:
        return
    vis.data[y_axis] = (vis.data[y_axis] - y_min) / y_range
def euclidean_dist(query_vis, vis):
    """
    Calculates euclidean distance score for similarity between two visualizations.

    The distance is taken between the y columns of the two visualizations'
    data. When either visualization has no y attribute, a message is printed
    and 0 is returned.

    Parameters
    ----------
    query_vis : lux.vis.Vis
        vis that represents the query pattern
    vis : lux.vis.Vis
        vis that represents the candidate visualization

    Returns
    -------
    score : float
        euclidean distance score
    """
    if not (query_vis.get_attr_by_channel("y") and vis.get_attr_by_channel("y")):
        print("no y axis detected")
        return 0

    candidate_column = vis.get_attr_by_channel("y")[0].attribute
    query_column = query_vis.get_attr_by_channel("y")[0].attribute
    difference = vis.data[candidate_column].values - query_vis.data[query_column].values
    return np.linalg.norm(difference)
def preprocess(vis, length=100):
    """
    Processes vis data to allow similarity comparisons between visualizations.

    Runs, in order, ``aggregate``, ``interpolate`` and ``normalize`` on the
    vis, each of which modifies ``vis.data`` in place.

    Parameters
    ----------
    vis : lux.vis.Vis
        vis that represents the candidate visualization
    length : int
        number of points the vis is resampled to by ``interpolate``.
        Defaults to 100. Visualizations that are going to be compared with
        ``euclidean_dist`` must be preprocessed with the same ``length``,
        since the distance is computed element-wise on their y columns.

    Returns
    -------
    None
    """
    aggregate(vis)
    interpolate(vis, length)
    normalize(vis)