谢谢你的贡献。我不得不稍微修改一下你的代码,补上一些缺失的内容并使其适配 Python 3。主要改动如下:
- 添加 import string(原代码缺少这条导入语句)
- 将 dictionary.iteritems 更改为 dictionary.items (python3)
- 将所有 print "..." 转换为 print("...") (python3)
- 补上原代码中未定义的变量 group_names
- 在遍历 group_names 的循环中,将 GroupName 强制转换为 str
- 将最终字典按键排序,结果保存为 dict2
你的原始数据现在保存在名为 input2.csv 的 CSV 文件中,内容如下:
,group1,group2,meandiff,lower,upper,reject
0,101,102,0.2917,-0.0425,0.6259,False
1,101,103,0.1571,-0.1649,0.4792,False
2,101,104,-0.1333,-0.4675,0.2009,False
3,101,105,0.0833,-0.2509,0.4175,False
4,101,106,-0.0500,-0.3626,0.2626,False
5,102,103,-0.1345,-0.4566,0.1875,False
6,102,104,-0.4250,-0.7592,-0.0908,True
7,102,105,-0.2083,-0.5425,0.1259,False
8,102,106,-0.3417,-0.6543,-0.0290,True
9,103,104,-0.2905,-0.6125,0.0316,False
10,103,105,-0.0738,-0.3959,0.2482,False
11,103,106,-0.2071,-0.5067,0.0924,False
12,104,105,0.2167,-0.1175,0.5509,False
13,104,106,0.0833,-0.2293,0.3960,False
14,105,106,-0.1333,-0.4460,0.1793,False
import pandas as pd
import numpy as np
import math
import itertools
import string
# Read the pairwise comparison table; the first CSV column is used as the row index.
df = pd.read_csv('input2.csv', index_col=0)
# Keep only the comparisons whose `reject` flag is True.
df_True = df[df.reject == True]  # noqa: E712 - element-wise comparison on a Series

# Pool of candidate labels ('a'..'z') and the counter used to index into it.
letters = [*string.ascii_lowercase]
n = 0

# Names of every group that takes part in at least one rejected comparison.
group1_list = df_True.group1.tolist()
group2_list = df_True.group2.tolist()
group3 = [*group1_list, *group2_list]  # both columns pooled together
group4 = list(set(group3))  # duplicates removed
group5 = sorted(map(str, group4))  # names as strings, in sorted order

# Each of those groups starts with the placeholder 0, i.e. "no label yet".
dictionary = dict.fromkeys(group5, 0)

# Every unordered pair of the sorted names, as (earlier, later) tuples.
group6 = list(itertools.combinations(group5, 2))
def _shares_label(first_label, second_label):
    """Return True when the two label strings have at least one letter in common."""
    return any(ch in second_label for ch in first_label)


# Pairs that appear in df_True, stored as strings and in both orders.
# The names in `group6` are strings while the CSV columns may have been parsed
# as integers, so comparing the raw column values against a string name never
# matches; converting with str() mirrors how the names were built.  Both orders
# are stored because the pairs are generated from string-sorted names, which
# need not agree with the (group1, group2) order of the table rows.
significant_pairs = set()
for g1, g2 in zip(df_True.group1.tolist(), df_True.group2.tolist()):
    significant_pairs.add((str(g1), str(g2)))
    significant_pairs.add((str(g2), str(g1)))

# Walk over every pair of groups and assign letter labels so that groups that
# are "equal" (pair absent from df_True) end up sharing a letter, while groups
# that differ (pair present in df_True) do not.  A value of 0 in `dictionary`
# means the group has no label yet.
# NOTE(review): `n` grows by one per pair and is used to index `letters`
# (26 entries), so letters[n] / letters[n + 1] raises IndexError once enough
# pairs have been processed.
for pairs in group6:
    print(n)
    print(dictionary)
    first, second = pairs

    if pairs not in significant_pairs:
        # The pair is not listed in df_True, so the two groups are treated as equal.
        print('equal')
        if dictionary[first] != 0 and dictionary[second] == 0:
            # Only the first group is labelled: the second inherits its label.
            print("1st is populated and 2nd is empty")
            dictionary[second] = dictionary[first]
        elif dictionary[first] != 0 and dictionary[second] != 0:
            print("both are populated")
            if _shares_label(dictionary[first], dictionary[second]):
                print("they have a shared character")
            else:
                print("equal but have different labels")
                # Count the other groups that share no letter with the first
                # group, and likewise for the second group.
                unrelated_to_first = sum(
                    1
                    for key, value in dictionary.items()
                    if key != first and not _shares_label(dictionary[first], value)
                )
                unrelated_to_second = sum(
                    1
                    for key, value in dictionary.items()
                    if key != second and not _shares_label(dictionary[second], value)
                )
                others = len(dictionary) - 1
                if unrelated_to_first == others and unrelated_to_second == others:
                    # Neither label is shared with any other group, so the
                    # second group can simply take the first group's letter.
                    print("unique")
                    dictionary[second] = dictionary[first][0]
                else:
                    # Some other group shares a letter: keep the second group's
                    # label and append the first group's leading letter to it.
                    print("there is at least one group in the dict that shares a char with the 1st group")
                    dictionary[second] = dictionary[second] + dictionary[first][0]
        else:
            # Reached whenever the first group has no label yet.
            print("both are empty")
            dictionary[first] = letters[n]
            dictionary[second] = letters[n]
    else:
        # The pair is listed in df_True, so the two groups must not share a letter.
        print("not equal")
        if dictionary[first] != 0:
            print('1st is populated')
            if dictionary[second] != 0 and not _shares_label(dictionary[first], dictionary[second]):
                # Already labelled with disjoint letters.
                print("no change")
            elif dictionary[second] == 0:
                # The second group is unlabelled: give it a fresh letter.
                dictionary[second] = letters[n + 1]
            else:
                # Both are labelled and share a letter: move the second group to
                # a new letter and append that letter to every other group
                # (except the first) that shared a letter with its old label.
                print("need to add a letter")
                original_value = dictionary[second]
                dictionary[second] = letters[n]
                for key, value in dictionary.items():
                    if key != first and _shares_label(original_value, value):
                        dictionary[key] = original_value + letters[n]
        else:
            print('1st is empty')
            dictionary[first] = letters[n]
            dictionary[second] = letters[n + 1]

    print(dictionary)
    n += 1
# Gather the distinct labels assigned by the pairwise loop and concatenate them;
# this combined label is given to every group that has no entry in `dictionary`.
labels = list(dictionary.values())
labels1 = sorted(set(labels))
final_label = ''.join(labels1)

# Every group name that occurs in either column of the full table.
df2 = pd.concat([df.group1, df.group2])
group_names = df2.unique()
for GroupName in group_names:
    # The keys of `dictionary` are strings, so the lookup must use the string
    # form as well.  Testing the raw value (an integer when the CSV column is
    # numeric) never matches, and the assignment below would then overwrite
    # every label computed by the loop with `final_label`.
    if str(GroupName) in dictionary:
        print("already exists")
    else:
        dictionary[str(GroupName)] = final_label

# Reduce each label to its unique letters, in sorted order so the result is
# deterministic (joining a bare set gives an arbitrary character order).
for key in dictionary:
    dictionary[key] = ''.join(sorted(set(dictionary[key])))

# Final output: the labels keyed by group name, ordered by key.
dict2 = dict(sorted(dictionary.items()))