feature_mapper
Feature mapper maps attributes to features based on ingested data dictionary by the user.
Expand source code
"""Feature mapper maps attributes to features based on ingested data dictionary by the user.""" import copy import random import re import numpy as np import pandas as pd import plotly.graph_objects as go from sentence_transformers import util from anovos.feature_recommender.featrec_init import ( EmbeddingsTrainFer, camel_case_split, feature_recommendation_prep, get_column_name, model_fer, recommendation_data_prep, ) from anovos.feature_recommender.feature_explorer import ( list_usecase_by_industry, process_industry, process_usecase, ) list_train_fer, df_rec_fer = feature_recommendation_prep() list_embedding_train_fer = EmbeddingsTrainFer(list_train_fer) ( feature_name_column, feature_desc_column, industry_column, usecase_column, ) = get_column_name(df_rec_fer) def feature_mapper( df, name_column=None, desc_column=None, suggested_industry="all", suggested_usecase="all", semantic=True, top_n=2, threshold=0.3, ): """Matches features for users based on their input attributes, and their goal industry and/or use case Parameters ---------- df : DataFrame Input DataFrame - Users' Data dictionary. It is expected to consist of attribute name and/or attribute description name_column : str Input, column name of Attribute Name in Input DataFrame. Default is None. desc_column : str Input, column name of Attribute Description in Input DataFrame. Default is None. suggested_industry : str Input, Industry of interest to the user (if any) to be filtered out. Default is 'all', meaning all Industries available. suggested_usecase : str Input, Usecase of interest to the user (if any) to be filtered out. Default is 'all', meaning all Usecases available. semantic : bool Input semantic - Whether the input needs to go through semantic similarity or not. Default is True. top_n : int Number of features displayed. Default is 2 threshold : float Input threshold value. 
Default is 0.3 Returns ------- DataFrame Columns are: - Input Attribute Name: Name of the input Attribute - Input Attribute Description: Description of the input Attribute - Matched Feature Name: Name of the matched Feature - Matched Feature Description: Description of the matched Feature - Feature Similarity Score: Semantic similarity score between input Attribute and matched Feature - Industry: Industry name of the matched Feature - Usecase: Usecase name of the matched Feature """ if not isinstance(df, pd.DataFrame): raise TypeError("Invalid input for df") if type(top_n) != int or top_n < 0: raise TypeError("Invalid input for top_n") if top_n > len(list_train_fer): raise TypeError("top_n value is too large") if type(threshold) != float: raise TypeError("Invalid input for threshold") if threshold < 0 or threshold > 1: raise TypeError( "Invalid input for threshold. Threshold value is between 0 and 1" ) list_user, df_user = recommendation_data_prep(df, name_column, desc_column) if suggested_industry != "all" and suggested_industry == "all": suggested_industry = process_industry(suggested_industry, semantic) df_rec_fr = df_rec_fer[df_rec_fer.iloc[:, 2].str.contains(suggested_industry)] list_keep = list(df_rec_fr.index) list_embedding_train_fr = [ list_embedding_train_fer.get.tolist()[x] for x in list_keep ] df_rec_fr = df_rec_fr.reset_index(drop=True) elif suggested_usecase != "all" and suggested_industry == "all": suggested_usecase = process_usecase(suggested_usecase, semantic) df_rec_fr = df_rec_fer[df_rec_fer.iloc[:, 3].str.contains(suggested_usecase)] list_keep = list(df_rec_fr.index) list_embedding_train_fr = [ list_embedding_train_fer.get.tolist()[x] for x in list_keep ] df_rec_fr = df_rec_fr.reset_index(drop=True) elif suggested_usecase != "all" and suggested_industry != "all": suggested_industry = process_industry(suggested_industry, semantic) suggested_usecase = process_usecase(suggested_usecase, semantic) df_rec_fr = df_rec_fer[ df_rec_fer.iloc[:, 
2].str.contains(suggested_industry) & df_rec_fer.iloc[:, 3].str.contains(suggested_usecase) ] if len(df_rec_fr) > 0: list_keep = list(df_rec_fr.index) list_embedding_train_fr = [ list_embedding_train_fer.get.tolist()[x] for x in list_keep ] df_rec_fr = df_rec_fr.reset_index(drop=True) else: df_out = pd.DataFrame( columns=[ "Input_Attribute_Name", "Input_Attribute_Description", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ] ) print("Industry/Usecase pair does not exist.") return df_out else: df_rec_fr = df_rec_fer list_embedding_train_fr = list_embedding_train_fer.get if name_column is None: df_out = pd.DataFrame( columns=[ "Input_Attribute_Description", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ] ) elif desc_column is None: df_out = pd.DataFrame( columns=[ "Input_Attribute_Name", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ] ) else: df_out = pd.DataFrame( columns=[ "Input_Attribute_Name", "Input_Attribute_Description", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ] ) list_embedding_user = model_fer.model.encode(list_user, convert_to_tensor=True) for i, feature in enumerate(list_user): cos_scores = util.pytorch_cos_sim(list_embedding_user, list_embedding_train_fr)[ i ] top_results = np.argpartition(-cos_scores, range(top_n))[0:top_n] for idx in top_results[0:top_n]: single_score = "%.4f" % (cos_scores[idx]) if name_column is None: if float(single_score) >= threshold: df_append = pd.DataFrame( [ [ df_user[desc_column].iloc[i], df_rec_fr[feature_name_column].iloc[int(idx)], df_rec_fr[feature_desc_column].iloc[int(idx)], "%.4f" % (cos_scores[idx]), df_rec_fr[industry_column].iloc[int(idx)], df_rec_fr[usecase_column].iloc[int(idx)], ] ], columns=[ "Input_Attribute_Description", "Matched_Feature_Name", 
"Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ], ) else: df_append = pd.DataFrame( [ [ df_user[desc_column].iloc[i], "N/A", "N/A", "N/A", "N/A", "N/A", ] ], columns=[ "Input_Attribute_Description", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ], ) elif desc_column is None: if float(single_score) >= threshold: df_append = pd.DataFrame( [ [ df_user[name_column].iloc[i], df_rec_fr[feature_name_column].iloc[int(idx)], df_rec_fr[feature_desc_column].iloc[int(idx)], "%.4f" % (cos_scores[idx]), df_rec_fr[industry_column].iloc[int(idx)], df_rec_fr[usecase_column].iloc[int(idx)], ] ], columns=[ "Input_Attribute_Name", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ], ) else: df_append = pd.DataFrame( [ [ df_user[name_column].iloc[i], "N/A", "N/A", "N/A", "N/A", "N/A", ] ], columns=[ "Input_Attribute_Name", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ], ) else: if float(single_score) >= threshold: df_append = pd.DataFrame( [ [ df_user[name_column].iloc[i], df_user[desc_column].iloc[i], df_rec_fr[feature_name_column].iloc[int(idx)], df_rec_fr[feature_desc_column].iloc[int(idx)], "%.4f" % (cos_scores[idx]), df_rec_fr[industry_column].iloc[int(idx)], df_rec_fr[usecase_column].iloc[int(idx)], ] ], columns=[ "Input_Attribute_Name", "Input_Attribute_Description", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ], ) else: df_append = pd.DataFrame( [ [ df_user[name_column].iloc[i], df_user[desc_column].iloc[i], "N/A", "N/A", "N/A", "N/A", "N/A", ] ], columns=[ "Input_Attribute_Name", "Input_Attribute_Description", "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ], ) df_out = pd.concat( [df_out, df_append], ignore_index=True, axis=0, join="outer" ) return 
df_out def find_attr_by_relevance( df, building_corpus, name_column=None, desc_column=None, threshold=0.3 ): """Provide a comprehensive mapping method from users' input attributes to their own feature corpus, and therefore, help with the process of creating features in cold-start problems Parameters ---------- df : DataFrame Input DataFrame - Users' Data dictionary. It is expected to consist of attribute name and/or attribute description building_corpus : list Input Feature Description name_column : str Input, column name of Attribute Name in Input DataFrame. Default is None. desc_column : str Input, column name of Attribute Description in Input DataFrame. Default is None. threshold : float Input threshold value Default is 0.3 Returns ------- DataFrame Columns are: - Input Feature Desc: Description of the input Feature - Recommended Input Attribute Name: Name of the recommended Feature - Recommended Input Attribute Description: Description of the recommended Feature - Input Attribute Similarity Score: Semantic similarity score between input Attribute and recommended Feature """ if not isinstance(df, pd.DataFrame): raise TypeError("Invalid input for df") if type(building_corpus) != list: raise TypeError("Invalid input for building_corpus") if type(threshold) != float: raise TypeError("Invalid input for building_corpus") if threshold < 0 or threshold > 1: raise TypeError( "Invalid input for threshold. 
Threshold value is between 0 and 1" ) for i in range(len(building_corpus)): if type(building_corpus[i]) != str: raise TypeError("Invalid input inside building_corpus:", building_corpus[i]) building_corpus[i] = re.sub("[^A-Za-z0-9]+", " ", building_corpus[i]) building_corpus[i] = camel_case_split(building_corpus[i]) building_corpus[i] = building_corpus[i].lower().strip() if name_column is None: df_out = pd.DataFrame( columns=[ "Input_Feature_Description", "Recommended_Input_Attribute_Description", "Input_Attribute_Similarity_Score", ] ) elif desc_column is None: df_out = pd.DataFrame( columns=[ "Input_Feature_Description", "Recommended_Input_Attribute_Name", "Input_Attribute_Similarity_Score", ] ) else: df_out = pd.DataFrame( columns=[ "Input_Feature_Description", "Recommended_Input_Attribute_Name", "Recommended_Input_Attribute_Description", "Input_Attribute_Similarity_Score", ] ) list_user, df_user = recommendation_data_prep(df, name_column, desc_column) list_embedding_user = model_fer.model.encode(list_user, convert_to_tensor=True) list_embedding_building = model_fer.model.encode( building_corpus, convert_to_tensor=True ) for i, feature in enumerate(building_corpus): if name_column is None: df_append = pd.DataFrame( columns=[ "Input_Feature_Description", "Recommended_Input_Attribute_Description", "Input_Attribute_Similarity_Score", ] ) elif desc_column is None: df_append = pd.DataFrame( columns=[ "Input_Feature_Description", "Recommended_Input_Attribute_Name", "Input_Attribute_Similarity_Score", ] ) else: df_append = pd.DataFrame( columns=[ "Input_Feature_Description", "Recommended_Input_Attribute_Name", "Recommended_Input_Attribute_Description", "Input_Attribute_Similarity_Score", ] ) cos_scores = util.pytorch_cos_sim(list_embedding_building, list_embedding_user)[ i ] top_results = np.argpartition(-cos_scores, range(len(list_user)))[ 0 : len(list_user) ] for idx in top_results[0 : len(list_user)]: single_score = "%.4f" % (cos_scores[idx]) if float(single_score) 
>= threshold: if name_column is None: df_append.loc[len(df_append.index)] = [ feature, df_user[desc_column].iloc[int(idx)], single_score, ] elif desc_column is None: df_append.loc[len(df_append.index)] = [ feature, df_user[name_column].iloc[int(idx)], single_score, ] else: df_append.loc[len(df_append.index)] = [ feature, df_user[name_column].iloc[int(idx)], df_user[desc_column].iloc[int(idx)], single_score, ] if len(df_append) == 0: if name_column is None: df_append.loc[len(df_append.index)] = [feature, "N/A", "N/A"] elif desc_column is None: df_append.loc[len(df_append.index)] = [feature, "N/A", "N/A"] else: df_append.loc[len(df_append.index)] = [feature, "N/A", "N/A", "N/A"] df_out = pd.concat([df_out, df_append], ignore_index=True, axis=0, join="outer") return df_out def sankey_visualization(df, industry_included=False, usecase_included=False): """Visualize Feature Mapper functions through Sankey plots Parameters ---------- df : DataFrame Input DataFrame. This DataFrame needs to be output of feature_mapper or find_attr_by_relevance, or in the same format. industry_included : bool Whether the plot needs to include industry mapping or not. Default is False usecase_included : bool Whether the plot needs to include usecase mapping or not. Default is False Returns ------- A `plotly` graph object. 
""" fr_proper_col_list = [ "Matched_Feature_Name", "Matched_Feature_Description", "Feature_Similarity_Score", "Industry", "Usecase", ] attr_proper_col_list = [ "Input_Feature_Description", "Input_Attribute_Similarity_Score", ] if not isinstance(df, pd.DataFrame): raise TypeError("Invalid input for df") if not all(x in list(df.columns) for x in fr_proper_col_list) and not all( x in list(df.columns) for x in attr_proper_col_list ): raise TypeError( "df is not output DataFrame of Feature Recommendation functions" ) if type(industry_included) != bool: raise TypeError("Invalid input for industry_included") if type(usecase_included) != bool: raise TypeError("Invalid input for usecase_included") if "Feature_Similarity_Score" in df.columns: if "Input_Attribute_Name" in df.columns: name_source = "Input_Attribute_Name" else: name_source = "Input_Attribute_Description" name_target = "Matched_Feature_Name" name_score = "Feature_Similarity_Score" else: name_source = "Input_Feature_Description" if "Recommended_Input_Attribute_Name" in df.columns: name_target = "Recommended_Input_Attribute_Name" else: name_target = "Recommended_Input_Attribute_Description" name_score = "Input_Attribute_Similarity_Score" if industry_included or usecase_included: print( "Input is find_attr_by_relevance output DataFrame. There is no suggested Industry and/or Usecase." 
) industry_included = False usecase_included = False industry_target = "Industry" usecase_target = "Usecase" df_iter = copy.deepcopy(df) for i in range(len(df_iter)): if str(df_iter[name_score][i]) == "N/A": df = df.drop([i]) df = df.reset_index(drop=True) source = [] target = [] value = [] if not industry_included and not usecase_included: source_list = df[name_source].unique().tolist() target_list = df[name_target].unique().tolist() label = source_list + target_list for i in range(len(df)): source.append(label.index(str(df[name_source][i]))) target.append(label.index(str(df[name_target][i]))) value.append(float(df[name_score][i])) elif not industry_included and usecase_included: source_list = df[name_source].unique().tolist() target_list = df[name_target].unique().tolist() raw_usecase_list = df[usecase_target].unique().tolist() usecase_list = [] for i, item in enumerate(raw_usecase_list): if ", " in raw_usecase_list[i]: raw_usecase_list[i] = raw_usecase_list[i].split(", ") for j, sub_item in enumerate(raw_usecase_list[i]): usecase_list.append(sub_item) else: usecase_list.append(item) label = source_list + target_list + usecase_list for i in range(len(df)): source.append(label.index(str(df[name_source][i]))) target.append(label.index(str(df[name_target][i]))) value.append(float(df[name_score][i])) temp_list = df[usecase_target][i].split(", ") for k, item in enumerate(temp_list): source.append(label.index(str(df[name_target][i]))) target.append(label.index(str(item))) value.append(float(1)) elif industry_included and not usecase_included: source_list = df[name_source].unique().tolist() target_list = df[name_target].unique().tolist() raw_industry_list = df[industry_target].unique().tolist() industry_list = [] for i, item in enumerate(raw_industry_list): if ", " in raw_industry_list[i]: raw_industry_list[i] = raw_industry_list[i].split(", ") for j, sub_item in enumerate(raw_industry_list[i]): industry_list.append(sub_item) else: industry_list.append(item) label = 
source_list + target_list + industry_list for i in range(len(df)): source.append(label.index(str(df[name_source][i]))) target.append(label.index(str(df[name_target][i]))) value.append(float(df[name_score][i])) temp_list = df[industry_target][i].split(", ") for k, item in enumerate(temp_list): source.append(label.index(str(df[name_target][i]))) target.append(label.index(str(item))) value.append(float(1)) else: source_list = df[name_source].unique().tolist() target_list = df[name_target].unique().tolist() raw_industry_list = df[industry_target].unique().tolist() raw_usecase_list = df[usecase_target].unique().tolist() industry_list = [] for i, item in enumerate(raw_industry_list): if ", " in raw_industry_list[i]: raw_industry_list[i] = raw_industry_list[i].split(", ") for j, sub_item in enumerate(raw_industry_list[i]): industry_list.append(sub_item) else: industry_list.append(item) usecase_list = [] for i, item in enumerate(raw_usecase_list): if ", " in raw_usecase_list[i]: raw_usecase_list[i] = raw_usecase_list[i].split(", ") for j, sub_item in enumerate(raw_usecase_list[i]): usecase_list.append(sub_item) else: usecase_list.append(item) label = source_list + target_list + industry_list + usecase_list for i in range(len(df)): source.append(label.index(str(df[name_source][i]))) target.append(label.index(str(df[name_target][i]))) value.append(float(df[name_score][i])) temp_list_industry = df[industry_target][i].split(", ") temp_list_usecase = df[usecase_target][i].split(", ") for k, item_industry in enumerate(temp_list_industry): source.append(label.index(str(df[name_target][i]))) target.append(label.index(str(item_industry))) value.append(float(1)) for j, item_usecase in enumerate(temp_list_usecase): if ( item_usecase in list_usecase_by_industry(item_industry)[ usecase_column ].tolist() ): source.append(label.index(str(item_industry))) target.append(label.index(str(item_usecase))) value.append(float(1)) line_color = [ "#" + "".join([random.choice("0123456789ABCDEF") 
for j in range(6)]) for k in range(len(value)) ] label_color = [ "#" + "".join([random.choice("0123456789ABCDEF") for e in range(6)]) for f in range(len(label)) ] fig = go.Figure( data=[ go.Sankey( node=dict( pad=15, thickness=20, line=dict(color=line_color, width=0.5), label=label, color=label_color, ), link=dict(source=source, target=target, value=value), ) ] ) fig.update_layout(title_text="Feature Mapper Sankey Visualization", font_size=10) return fig
Functions
def feature_mapper(df, name_column=None, desc_column=None, suggested_industry='all', suggested_usecase='all', semantic=True, top_n=2, threshold=0.3)
-
Matches features for users based on their input attributes, and their goal industry and/or use case
Parameters
df
:DataFrame
- Input DataFrame - Users' Data dictionary. It is expected to consist of attribute name and/or attribute description
name_column
:str
- Input, column name of Attribute Name in Input DataFrame. Default is None.
desc_column
:str
- Input, column name of Attribute Description in Input DataFrame. Default is None.
suggested_industry
:str
- Input, Industry of interest to the user (if any) to be filtered out. Default is 'all', meaning all Industries available.
suggested_usecase
:str
- Input, Usecase of interest to the user (if any) to be filtered out. Default is 'all', meaning all Usecases available.
semantic
:bool
- Input semantic - Whether the input needs to go through semantic similarity or not. Default is True.
top_n
:int
- Number of features displayed. Default is 2
threshold
:float
- Input threshold value. Default is 0.3
Returns
DataFrame
-
Columns are:
- Input Attribute Name: Name of the input Attribute
- Input Attribute Description: Description of the input Attribute
- Matched Feature Name: Name of the matched Feature
- Matched Feature Description: Description of the matched Feature
- Feature Similarity Score: Semantic similarity score between input Attribute and matched Feature
- Industry: Industry name of the matched Feature
- Usecase: Usecase name of the matched Feature
Expand source code
def _mapper_columns(name_column, desc_column):
    """Build feature_mapper's output column list from which input columns were supplied."""
    columns = []
    if name_column is not None:
        columns.append("Input_Attribute_Name")
    if desc_column is not None:
        columns.append("Input_Attribute_Description")
    columns += [
        "Matched_Feature_Name",
        "Matched_Feature_Description",
        "Feature_Similarity_Score",
        "Industry",
        "Usecase",
    ]
    return columns


def _input_values(df_user, name_column, desc_column, i):
    """Collect the user's attribute name/description values for row *i* (only supplied columns)."""
    values = []
    if name_column is not None:
        values.append(df_user[name_column].iloc[i])
    if desc_column is not None:
        values.append(df_user[desc_column].iloc[i])
    return values


def _filter_feature_corpus(suggested_industry, suggested_usecase, semantic):
    """Filter the reference corpus and its embeddings by industry and/or usecase.

    Returns a (DataFrame, embeddings) pair. With no filter the full corpus is
    returned; when the requested industry/usecase filter matches nothing,
    (None, None) is returned so the caller can report it.
    """
    if suggested_industry == "all" and suggested_usecase == "all":
        return df_rec_fer, list_embedding_train_fer.get
    mask = pd.Series(True, index=df_rec_fer.index)
    if suggested_industry != "all":
        # BUGFIX: the original guard compared suggested_industry to "all" twice,
        # so industry-only filtering could never trigger. Column 2 holds industries.
        mask &= df_rec_fer.iloc[:, 2].str.contains(
            process_industry(suggested_industry, semantic)
        )
    if suggested_usecase != "all":
        # Column 3 of the corpus holds the usecase names.
        mask &= df_rec_fer.iloc[:, 3].str.contains(
            process_usecase(suggested_usecase, semantic)
        )
    df_filtered = df_rec_fer[mask]
    if len(df_filtered) == 0:
        return None, None
    embeddings = [list_embedding_train_fer.get.tolist()[x] for x in df_filtered.index]
    return df_filtered.reset_index(drop=True), embeddings


def feature_mapper(
    df,
    name_column=None,
    desc_column=None,
    suggested_industry="all",
    suggested_usecase="all",
    semantic=True,
    top_n=2,
    threshold=0.3,
):
    """Matches features for users based on their input attributes, and their
    goal industry and/or use case.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame - Users' Data dictionary. It is expected to consist of
        attribute name and/or attribute description.
    name_column : str
        Column name of Attribute Name in the input DataFrame. Default is None.
    desc_column : str
        Column name of Attribute Description in the input DataFrame. Default is None.
    suggested_industry : str
        Industry of interest to filter the corpus on. Default is 'all'
        (all industries available).
    suggested_usecase : str
        Usecase of interest to filter the corpus on. Default is 'all'
        (all usecases available).
    semantic : bool
        Whether industry/usecase inputs go through semantic similarity matching.
        Default is True.
    top_n : int
        Number of features displayed per attribute. Default is 2.
    threshold : float
        Minimum similarity score for a match. Default is 0.3.

    Returns
    -------
    DataFrame
        Columns are: Input Attribute Name and/or Description (depending on the
        supplied columns), Matched Feature Name, Matched Feature Description,
        Feature Similarity Score, Industry, Usecase. Matches below the
        threshold are reported as "N/A".

    Raises
    ------
    TypeError
        If any argument fails validation.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Invalid input for df")
    # Exact-type checks (rather than isinstance) deliberately reject bool for top_n.
    if type(top_n) != int or top_n < 0:
        raise TypeError("Invalid input for top_n")
    if top_n > len(list_train_fer):
        raise TypeError("top_n value is too large")
    if type(threshold) != float:
        raise TypeError("Invalid input for threshold")
    if threshold < 0 or threshold > 1:
        raise TypeError(
            "Invalid input for threshold. Threshold value is between 0 and 1"
        )
    list_user, df_user = recommendation_data_prep(df, name_column, desc_column)
    df_rec_fr, list_embedding_train_fr = _filter_feature_corpus(
        suggested_industry, suggested_usecase, semantic
    )
    if df_rec_fr is None:
        # Nothing in the corpus survives the filter: return an empty frame.
        print("Industry/Usecase pair does not exist.")
        return pd.DataFrame(
            columns=[
                "Input_Attribute_Name",
                "Input_Attribute_Description",
                "Matched_Feature_Name",
                "Matched_Feature_Description",
                "Feature_Similarity_Score",
                "Industry",
                "Usecase",
            ]
        )
    out_columns = _mapper_columns(name_column, desc_column)
    df_out = pd.DataFrame(columns=out_columns)
    list_embedding_user = model_fer.model.encode(list_user, convert_to_tensor=True)
    for i in range(len(list_user)):
        cos_scores = util.pytorch_cos_sim(
            list_embedding_user, list_embedding_train_fr
        )[i]
        # Indices of the top_n highest-scoring corpus features for attribute i.
        top_results = np.argpartition(-cos_scores, range(top_n))[0:top_n]
        for idx in top_results[0:top_n]:
            single_score = "%.4f" % (cos_scores[idx])
            if float(single_score) >= threshold:
                matched = [
                    df_rec_fr[feature_name_column].iloc[int(idx)],
                    df_rec_fr[feature_desc_column].iloc[int(idx)],
                    single_score,
                    df_rec_fr[industry_column].iloc[int(idx)],
                    df_rec_fr[usecase_column].iloc[int(idx)],
                ]
            else:
                matched = ["N/A"] * 5
            df_append = pd.DataFrame(
                [_input_values(df_user, name_column, desc_column, i) + matched],
                columns=out_columns,
            )
            df_out = pd.concat(
                [df_out, df_append], ignore_index=True, axis=0, join="outer"
            )
    return df_out
def find_attr_by_relevance(df, building_corpus, name_column=None, desc_column=None, threshold=0.3)
-
Provide a comprehensive mapping method from users' input attributes to their own feature corpus, and therefore, help with the process of creating features in cold-start problems
Parameters
df
:DataFrame
- Input DataFrame - Users' Data dictionary. It is expected to consist of attribute name and/or attribute description
building_corpus
:list
- Input Feature Description
name_column
:str
- Input, column name of Attribute Name in Input DataFrame. Default is None.
desc_column
:str
- Input, column name of Attribute Description in Input DataFrame. Default is None.
threshold
:float
- Input threshold value. Default is 0.3
Returns
DataFrame
-
Columns are:
- Input Feature Desc: Description of the input Feature
- Recommended Input Attribute Name: Name of the recommended input Attribute
- Recommended Input Attribute Description: Description of the recommended input Attribute
- Input Attribute Similarity Score: Semantic similarity score between input Attribute and recommended Feature
Expand source code
def find_attr_by_relevance(
    df, building_corpus, name_column=None, desc_column=None, threshold=0.3
):
    """Provide a comprehensive mapping from users' input attributes to their own
    feature corpus, helping create features in cold-start problems.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame - Users' Data dictionary. It is expected to consist of
        attribute name and/or attribute description.
    building_corpus : list
        Input feature descriptions. The list is not modified; normalization is
        applied to an internal copy.
    name_column : str
        Column name of Attribute Name in the input DataFrame. Default is None.
    desc_column : str
        Column name of Attribute Description in the input DataFrame. Default is None.
    threshold : float
        Minimum similarity score for a recommendation. Default is 0.3.

    Returns
    -------
    DataFrame
        One row per (feature, recommended attribute) pair with the similarity
        score; a feature with no attribute above the threshold gets a single
        "N/A" row.

    Raises
    ------
    TypeError
        If any argument fails validation.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Invalid input for df")
    if type(building_corpus) != list:
        raise TypeError("Invalid input for building_corpus")
    if type(threshold) != float:
        # BUGFIX: the original message wrongly named "building_corpus" here.
        raise TypeError("Invalid input for threshold")
    if threshold < 0 or threshold > 1:
        raise TypeError(
            "Invalid input for threshold. Threshold value is between 0 and 1"
        )
    # Work on a copy so the caller's list is not mutated by normalization.
    building_corpus = list(building_corpus)
    for i in range(len(building_corpus)):
        if type(building_corpus[i]) != str:
            raise TypeError("Invalid input inside building_corpus:", building_corpus[i])
        cleaned = re.sub("[^A-Za-z0-9]+", " ", building_corpus[i])
        building_corpus[i] = camel_case_split(cleaned).lower().strip()
    if name_column is None:
        out_columns = [
            "Input_Feature_Description",
            "Recommended_Input_Attribute_Description",
            "Input_Attribute_Similarity_Score",
        ]
    elif desc_column is None:
        out_columns = [
            "Input_Feature_Description",
            "Recommended_Input_Attribute_Name",
            "Input_Attribute_Similarity_Score",
        ]
    else:
        out_columns = [
            "Input_Feature_Description",
            "Recommended_Input_Attribute_Name",
            "Recommended_Input_Attribute_Description",
            "Input_Attribute_Similarity_Score",
        ]
    df_out = pd.DataFrame(columns=out_columns)
    list_user, df_user = recommendation_data_prep(df, name_column, desc_column)
    list_embedding_user = model_fer.model.encode(list_user, convert_to_tensor=True)
    list_embedding_building = model_fer.model.encode(
        building_corpus, convert_to_tensor=True
    )
    for i, feature in enumerate(building_corpus):
        df_append = pd.DataFrame(columns=out_columns)
        cos_scores = util.pytorch_cos_sim(
            list_embedding_building, list_embedding_user
        )[i]
        # Rank every user attribute against this feature, best first.
        top_results = np.argpartition(-cos_scores, range(len(list_user)))[
            0 : len(list_user)
        ]
        for idx in top_results:
            single_score = "%.4f" % (cos_scores[idx])
            if float(single_score) < threshold:
                continue
            if name_column is None:
                row = [feature, df_user[desc_column].iloc[int(idx)], single_score]
            elif desc_column is None:
                row = [feature, df_user[name_column].iloc[int(idx)], single_score]
            else:
                row = [
                    feature,
                    df_user[name_column].iloc[int(idx)],
                    df_user[desc_column].iloc[int(idx)],
                    single_score,
                ]
            df_append.loc[len(df_append.index)] = row
        if len(df_append) == 0:
            # No attribute cleared the threshold: emit one placeholder row.
            df_append.loc[0] = [feature] + ["N/A"] * (len(out_columns) - 1)
        df_out = pd.concat(
            [df_out, df_append], ignore_index=True, axis=0, join="outer"
        )
    return df_out
def sankey_visualization(df, industry_included=False, usecase_included=False)
-
Visualize Feature Mapper functions through Sankey plots
Parameters
df
:DataFrame
- Input DataFrame. This DataFrame needs to be the output of feature_mapper or find_attr_by_relevance, or in the same format.
industry_included
:bool
- Whether the plot needs to include industry mapping or not. Default is False
usecase_included
:bool
- Whether the plot needs to include usecase mapping or not. Default is False
Returns
A `plotly` graph object.
Expand source code
def sankey_visualization(df, industry_included=False, usecase_included=False):
    """Visualize Feature Mapper functions through Sankey plots

    Parameters
    ----------
    df : DataFrame
        Input DataFrame. This DataFrame needs to be output of feature_mapper or
        find_attr_by_relevance, or in the same format.
    industry_included : bool
        Whether the plot needs to include industry mapping or not. Default is False
    usecase_included : bool
        Whether the plot needs to include usecase mapping or not. Default is False

    Returns
    -------
    A `plotly` graph object.

    Raises
    ------
    TypeError
        If ``df`` is not a DataFrame, does not look like the output of a
        Feature Recommendation function, or if either flag is not a bool.
    """
    # Column signatures used to recognise which function produced `df`:
    # feature_mapper output carries the matched-feature/industry/usecase
    # columns, find_attr_by_relevance output carries only attribute columns.
    fr_proper_col_list = [
        "Matched_Feature_Name",
        "Matched_Feature_Description",
        "Feature_Similarity_Score",
        "Industry",
        "Usecase",
    ]
    attr_proper_col_list = [
        "Input_Feature_Description",
        "Input_Attribute_Similarity_Score",
    ]
    # --- input validation ---
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Invalid input for df")
    # Accept `df` if it matches either expected column set.
    if not all(x in list(df.columns) for x in fr_proper_col_list) and not all(
        x in list(df.columns) for x in attr_proper_col_list
    ):
        raise TypeError(
            "df is not output DataFrame of Feature Recommendation functions"
        )
    if type(industry_included) != bool:
        raise TypeError("Invalid input for industry_included")
    if type(usecase_included) != bool:
        raise TypeError("Invalid input for usecase_included")
    # Pick the source/target/score columns based on which function produced
    # `df` (presence of "Feature_Similarity_Score" => feature_mapper output).
    if "Feature_Similarity_Score" in df.columns:
        if "Input_Attribute_Name" in df.columns:
            name_source = "Input_Attribute_Name"
        else:
            name_source = "Input_Attribute_Description"
        name_target = "Matched_Feature_Name"
        name_score = "Feature_Similarity_Score"
    else:
        name_source = "Input_Feature_Description"
        if "Recommended_Input_Attribute_Name" in df.columns:
            name_target = "Recommended_Input_Attribute_Name"
        else:
            name_target = "Recommended_Input_Attribute_Description"
        name_score = "Input_Attribute_Similarity_Score"
        # find_attr_by_relevance output has no Industry/Usecase columns, so
        # the mapping flags are forced off (with a warning to the caller).
        if industry_included or usecase_included:
            print(
                "Input is find_attr_by_relevance output DataFrame. There is no suggested Industry and/or Usecase."
            )
            industry_included = False
            usecase_included = False
    industry_target = "Industry"
    usecase_target = "Usecase"
    # Drop placeholder rows (score == "N/A"): iterate over an untouched deep
    # copy so row positions stay aligned while `df` shrinks; the index is
    # reset only after the loop so the labels being dropped remain valid.
    df_iter = copy.deepcopy(df)
    for i in range(len(df_iter)):
        if str(df_iter[name_score][i]) == "N/A":
            df = df.drop([i])
    df = df.reset_index(drop=True)
    # Sankey link arrays: parallel lists of node indices and link weights.
    source = []
    target = []
    value = []
    # NOTE(review): node indices are resolved with label.index(), which
    # returns the FIRST match — duplicate names across the concatenated
    # source/target/industry/usecase lists collapse onto one node; presumably
    # acceptable for these plots, but confirm.
    if not industry_included and not usecase_included:
        # Simple two-layer plot: input attributes -> matched features.
        source_list = df[name_source].unique().tolist()
        target_list = df[name_target].unique().tolist()
        label = source_list + target_list
        for i in range(len(df)):
            source.append(label.index(str(df[name_source][i])))
            target.append(label.index(str(df[name_target][i])))
            value.append(float(df[name_score][i]))
    elif not industry_included and usecase_included:
        # Three-layer plot: attributes -> features -> usecases.
        source_list = df[name_source].unique().tolist()
        target_list = df[name_target].unique().tolist()
        raw_usecase_list = df[usecase_target].unique().tolist()
        usecase_list = []
        # Usecase cells may hold comma-separated lists; flatten them.
        # NOTE: mutates raw_usecase_list[i] in place while enumerating, then
        # walks the freshly split sub-list.
        for i, item in enumerate(raw_usecase_list):
            if ", " in raw_usecase_list[i]:
                raw_usecase_list[i] = raw_usecase_list[i].split(", ")
                for j, sub_item in enumerate(raw_usecase_list[i]):
                    usecase_list.append(sub_item)
            else:
                usecase_list.append(item)
        label = source_list + target_list + usecase_list
        for i in range(len(df)):
            source.append(label.index(str(df[name_source][i])))
            target.append(label.index(str(df[name_target][i])))
            value.append(float(df[name_score][i]))
            # Feature -> usecase links get a fixed weight of 1.
            temp_list = df[usecase_target][i].split(", ")
            for k, item in enumerate(temp_list):
                source.append(label.index(str(df[name_target][i])))
                target.append(label.index(str(item)))
                value.append(float(1))
    elif industry_included and not usecase_included:
        # Three-layer plot: attributes -> features -> industries.
        source_list = df[name_source].unique().tolist()
        target_list = df[name_target].unique().tolist()
        raw_industry_list = df[industry_target].unique().tolist()
        industry_list = []
        # Flatten comma-separated industry cells (same in-place split idiom).
        for i, item in enumerate(raw_industry_list):
            if ", " in raw_industry_list[i]:
                raw_industry_list[i] = raw_industry_list[i].split(", ")
                for j, sub_item in enumerate(raw_industry_list[i]):
                    industry_list.append(sub_item)
            else:
                industry_list.append(item)
        label = source_list + target_list + industry_list
        for i in range(len(df)):
            source.append(label.index(str(df[name_source][i])))
            target.append(label.index(str(df[name_target][i])))
            value.append(float(df[name_score][i]))
            # Feature -> industry links get a fixed weight of 1.
            temp_list = df[industry_target][i].split(", ")
            for k, item in enumerate(temp_list):
                source.append(label.index(str(df[name_target][i])))
                target.append(label.index(str(item)))
                value.append(float(1))
    else:
        # Four-layer plot: attributes -> features -> industries -> usecases.
        source_list = df[name_source].unique().tolist()
        target_list = df[name_target].unique().tolist()
        raw_industry_list = df[industry_target].unique().tolist()
        raw_usecase_list = df[usecase_target].unique().tolist()
        industry_list = []
        for i, item in enumerate(raw_industry_list):
            if ", " in raw_industry_list[i]:
                raw_industry_list[i] = raw_industry_list[i].split(", ")
                for j, sub_item in enumerate(raw_industry_list[i]):
                    industry_list.append(sub_item)
            else:
                industry_list.append(item)
        usecase_list = []
        for i, item in enumerate(raw_usecase_list):
            if ", " in raw_usecase_list[i]:
                raw_usecase_list[i] = raw_usecase_list[i].split(", ")
                for j, sub_item in enumerate(raw_usecase_list[i]):
                    usecase_list.append(sub_item)
            else:
                usecase_list.append(item)
        label = source_list + target_list + industry_list + usecase_list
        for i in range(len(df)):
            source.append(label.index(str(df[name_source][i])))
            target.append(label.index(str(df[name_target][i])))
            value.append(float(df[name_score][i]))
            temp_list_industry = df[industry_target][i].split(", ")
            temp_list_usecase = df[usecase_target][i].split(", ")
            for k, item_industry in enumerate(temp_list_industry):
                source.append(label.index(str(df[name_target][i])))
                target.append(label.index(str(item_industry)))
                value.append(float(1))
                # Only link industry -> usecase when the usecase actually
                # belongs to that industry per the training data.
                for j, item_usecase in enumerate(temp_list_usecase):
                    if (
                        item_usecase
                        in list_usecase_by_industry(item_industry)[
                            usecase_column
                        ].tolist()
                    ):
                        source.append(label.index(str(item_industry)))
                        target.append(label.index(str(item_usecase)))
                        value.append(float(1))
    # Random hex colours. NOTE(review): line_color has len(value) entries but
    # is passed as node.line.color, which plotly treats as per-node
    # (len(label) entries expected) — confirm intent.
    line_color = [
        "#" + "".join([random.choice("0123456789ABCDEF") for j in range(6)])
        for k in range(len(value))
    ]
    label_color = [
        "#" + "".join([random.choice("0123456789ABCDEF") for e in range(6)])
        for f in range(len(label))
    ]
    fig = go.Figure(
        data=[
            go.Sankey(
                node=dict(
                    pad=15,
                    thickness=20,
                    line=dict(color=line_color, width=0.5),
                    label=label,
                    color=label_color,
                ),
                link=dict(source=source, target=target, value=value),
            )
        ]
    )
    fig.update_layout(title_text="Feature Mapper Sankey Visualization", font_size=10)
    return fig