Source code for lux.utils.utils

#  Copyright 2019-2020 The Lux Authors.
# 
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
import pandas as pd
def convert_to_list(x):
    """
    Return ``x`` as a list, wrapping it in a one-element list when needed.

    "a" --> ["a"]
    ["a","b"] --> ["a","b"]

    Parameters
    ----------
    x : any
        A list, or any other single value.

    Returns
    -------
    list
        ``x`` itself when it is a list (no copy is made), otherwise ``[x]``.
    """
    # isinstance (instead of an exact type comparison) lets list subclasses
    # pass through untouched rather than being nested inside a new list.
    if isinstance(x, list):
        return x
    return [x]

def pandas_to_lux(df):
    """
    Build a LuxDataFrame from the cell values and column labels of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``values`` and ``columns`` are copied over.

    Returns
    -------
    LuxDataFrame
        New frame constructed from ``df.values.tolist()`` with the same
        column labels. Only the values and columns are passed to the
        constructor; the row index of ``df`` is not.
    """
    # Imported here rather than at module level; presumably to avoid a
    # circular import with lux.core.frame -- TODO confirm.
    from lux.core.frame import LuxDataFrame

    return LuxDataFrame(df.values.tolist(), columns=df.columns)

def get_attrs_specs(intent):
    """
    Return the clauses of ``intent`` whose ``value`` is the empty string.

    Parameters
    ----------
    intent : list or None
        Iterable of objects exposing a ``value`` attribute.

    Returns
    -------
    list
        Clauses with ``value == ""`` in their original order; an empty list
        when ``intent`` is None.
    """
    if intent is None:
        return []
    return [clause for clause in intent if clause.value == ""]

def get_filter_specs(intent):
    """
    Return the clauses of ``intent`` whose ``value`` is not the empty string.

    Parameters
    ----------
    intent : list or None
        Iterable of objects exposing a ``value`` attribute.

    Returns
    -------
    list
        Clauses with ``value != ""`` in their original order; an empty list
        when ``intent`` is None.
    """
    if intent is None:
        return []
    return [clause for clause in intent if clause.value != ""]

def check_import_lux_widget():
    """
    Verify that the ``luxwidget`` package can be located.

    Raises
    ------
    ImportError
        If no module spec for ``luxwidget`` is found. ImportError is a
        subclass of Exception, so callers catching Exception still work.
    """
    # importlib.util.find_spec replaces pkgutil.find_loader, which is
    # deprecated since Python 3.12 and scheduled for removal.
    import importlib.util

    if importlib.util.find_spec("luxwidget") is None:
        raise ImportError(
            "luxwidget is not installed. Run `pip install luxwidget` to install the Jupyter widget.\n"
            "See more at: https://github.com/lux-org/lux-widget"
        )

def get_agg_title(clause):
    """
    Build the display title for a clause.

    Parameters
    ----------
    clause : object
        Must expose ``aggregation`` and ``attribute``; ``_aggregation_name``
        is read only when an aggregation is set and the attribute is not
        "Record".

    Returns
    -------
    str
        * the attribute itself when ``clause.aggregation`` is None,
        * "Number of Records" when the attribute is "Record",
        * otherwise "<Aggregation name, capitalized> of <attribute>".
    """
    # The aggregation check comes first: an un-aggregated "Record" attribute
    # is titled "Record", not "Number of Records".
    if clause.aggregation is None:
        return f"{clause.attribute}"
    if clause.attribute == "Record":
        return "Number of Records"
    return f"{clause._aggregation_name.capitalize()} of {clause.attribute}"
def check_if_id_like(df, attribute):
    """
    Heuristically decide whether a column looks like an identifier field.

    A column is reported as ID-like when all of the following hold:

    * its cardinality (``df.cardinality[attribute]``) is above 500,
    * its label contains "id" (any letter case) or at least 98% of the rows
      hold distinct values,
    * for string columns only: the string lengths in a sample of at most 50
      values have a standard deviation below 3.

    Parameters
    ----------
    df : DataFrame-like
        Must expose ``cardinality`` (a mapping from column label to number of
        distinct values), ``len()`` and column access via ``df[attribute]``.
    attribute : hashable
        Column label to inspect. Non-string labels are converted with
        ``str()`` before the name check.

    Returns
    -------
    bool
        True when the column looks like an ID field, False otherwise.
    """
    import re

    cardinality = df.cardinality[attribute]

    # Strong signals
    # The cardinality floor keeps aggregated reset_index fields from being
    # misclassified.
    if not cardinality > 500:
        return False

    # str() guards against non-string labels (e.g. integer column names);
    # IGNORECASE also catches "ID" / "Id" style labels.
    attribute_contain_id = re.search(r"id", str(attribute), re.IGNORECASE) is not None
    almost_all_vals_unique = cardinality >= 0.98 * len(df)
    if not (attribute_contain_id or almost_all_vals_unique):
        return False

    column = df[attribute]
    if not pd.api.types.is_string_dtype(column):
        # TODO: Could probably add some type of entropy measure (since the
        # binned id fields are usually very even)
        return True

    # For string IDs, usually serial numbers or codes with alphanumerics have
    # a consistent length (e.g., CG-39405) with little deviation. For a high
    # cardinality string field that is not an ID field (like Name or Brand),
    # there is less uniformity across the string lengths.
    # The fixed random_state keeps the sample, and so the verdict, repeatable.
    sampled = column.sample(50, random_state=99) if len(df) > 50 else column
    # Non-string entries (e.g. missing values) count as length 0 so the
    # result is always numeric.
    lengths = sampled.map(lambda value: len(value) if isinstance(value, str) else 0)
    return bool(lengths.std() < 3)