根据您的描述,您需要先读取每个文件的第一行,才能按标识符对文件进行分组。我认为下面这段代码可以实现您想要的功能:
import os
import collections
import random
import shutil
def get_identifier(path):
    """Return the identifier stored on the first line of the file at *path*.

    The first line is stripped of leading and trailing whitespace, so the
    trailing newline is not part of the returned identifier. An empty file
    yields an empty string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    with open(path) as fd:
        # Only the first line is read; the rest of the file is never loaded.
        return fd.readline().strip()
# Input files to classify and the directory the sampled files are copied to.
paths = ['/home/file1', '/home/file2', '/home/file3']
destination_dir = '/tmp'
# Upper bound on how many files are copied per identifier.
sample_size = 500

# Group the input paths by the identifier found on each file's first line.
identifiers = collections.defaultdict(list)
for path in paths:
    identifier = get_identifier(path)
    identifiers[identifier].append(path)

# Copy a random sample of each group into destination_dir.
# The loop variable is deliberately not named `paths`, so the module-level
# input list is not rebound while iterating.
for identifier, group_paths in identifiers.items():
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the group size: groups smaller
    # than sample_size are copied in full.
    sample = random.sample(group_paths, min(sample_size, len(group_paths)))
    for path in sample:
        # NOTE: only the base name is kept, so two source files that share a
        # base name end up at the same destination path and the later copy
        # overwrites the earlier one.
        file_name = os.path.basename(path)
        destination = os.path.join(destination_dir, file_name)
        shutil.copy(path, destination)