Here is a solution that worked for me using Python:
categories = ["EE-Male", "EE-Female"]

# Build one regular expression per category. Each pattern matches any string
# that contains the category's characters in order, with arbitrary characters
# before, between and after them. Every character is escaped so that regex
# metacharacters in a category name are matched literally.
categories_regex = {
    category: ".*" + ".*".join(re.escape(char) for char in category) + ".*"
    for category in categories
}

# Store the cleaned category in a separate 'type2' column: clean_name_var
# still needs the original (dirty) 'type' values to recover the letters that
# belong to the name, and it reads the cleaned category from 'type2'.
df['type2'] = df['type'].apply(
    lambda cell: clean_categorical_var(cell, categories, categories_regex)
)
df['name'] = df.apply(
    lambda row: clean_name_var(row, 'type', 'name', categories, 'type2'),
    axis=1,
)

# Replace the dirty 'type' column with the cleaned one.
df.drop(labels=["type"], axis=1, inplace=True)
df.rename(columns={"type2": "type"}, inplace=True)
The code above relies on the following three auxiliary functions (they need `import re` and `import numpy as np`):
def clean_categorical_var(categorical_cell, categories, categories_regex):
    '''
    Cleans a categorical variable cell such as the type variable.

    Input:
        categorical_cell (str): content of the categorical cell to clean.
            Non-string values (e.g. NaN or None for a missing cell) are
            accepted and treated as "no category found".
        categories (list): list with the values (str) supposed to be found
            on the categorical column (ex. EE-Male, EE-Female)
        categories_regex (dic): categories as keys and a regular expression
            for each category as values.
    Output:
        cleaned_category (str): cleaned category without the mixed letters,
            or np.nan when the cell is not a string or matches no category.
            If the cell matches several categories, the last matching one
            in `categories` is returned.
    '''
    # Missing cells are not strings; re.match would raise TypeError on them.
    if not isinstance(categorical_cell, str):
        return np.nan

    cleaned_category = np.nan
    for category in categories:
        # No early exit on purpose: a later matching category overrides an
        # earlier one.
        if re.match(categories_regex[category], categorical_cell):
            cleaned_category = category
    return cleaned_category
def remove_letters(category, string_to_clean):
    '''
    Finds the positions of the category's letters inside a dirty categorical cell.

    The dirty string is scanned from left to right and every letter of the
    category is matched, in order, with its first available occurrence.
    Deleting the returned positions from the dirty string leaves only the
    extra letters that do not belong to the category.

    Input:
        category (str): clean category value (ex. EE-Male)
        string_to_clean (str): dirty content of the categorical cell from
            where to recover the missing letters
    Output:
        letters_index_to_delete (list): increasing indexes (int) of
            string_to_clean matched with the letters of the category. It
            covers only the leading letters of the category when
            string_to_clean does not contain all of them in order, and it is
            empty when either argument is empty.
    '''
    letters_index_to_delete = []
    pending = 0  # position in category of the next letter to locate
    for n, letter in enumerate(string_to_clean):
        # Checking for exhaustion first also covers an empty category, which
        # has no letter to compare against.
        if pending >= len(category):
            break
        if letter == category[pending]:
            letters_index_to_delete.append(n)
            pending += 1
    return letters_index_to_delete
def clean_name_var(row, categorical_column, name_column, categories,
                   categorical_column2='categorical_column_cleaned'):
    '''
    Cleans a name variable adding the letters that were missing at the end.

    The letters of the cleaned category are removed from the dirty categorical
    cell and whatever is left over is appended to the name.

    Input:
        row (df.row): The row from the df to be cleaned
        categorical_column (str): name of the column with the dirty
            categories (ex. type)
        name_column (str): name of the column to be cleaned
        categories (list): list with the values (str) supposed to be found
            on the categorical column (ex. EE-Male, EE-Female)
        categorical_column2 (str): name of the column with the categories
            already cleaned (ex. type2)
    Output:
        cleaned_name (str): cleaned name with the letters that were missing
            at the end. The name is returned unchanged when the dirty
            categorical cell is not a string (e.g. NaN/None) or is already
            one of the categories.
    '''
    dirty_category = row[categorical_column]

    # A missing cell holds no letters to recover, and an already clean cell
    # holds no extra ones.
    if not isinstance(dirty_category, str) or dirty_category in categories:
        return row[name_column]

    cleaned_category = row[categorical_column2]
    if cleaned_category in categories:
        letters_index_to_delete = set(
            remove_letters(cleaned_category, dirty_category)
        )
    else:
        # Unrecognised category: nothing is removed, so the whole dirty cell
        # ends up appended to the name.
        letters_index_to_delete = set()

    name_end = ''.join(
        letter
        for n, letter in enumerate(dirty_category)
        if n not in letters_index_to_delete
    )
    return row[name_column] + name_end