-1

我有一个看起来像这样的输入文件:

eya XLOC_000445_Change:10.3_q:0.003 10  atonal1
six XLOC_00099_Change:70.0_q:0.095  30  atonal1
six-eya XLOC_0234324_Change:19.8_q:0.05 40  atonal1
eya XLOC_00010_Change:6.5_q:0.22    41  c-myc
six XLOC_025437_Change:1.1_q:0.018  22  c-myc
six-eya XLOC_001045_Change:2.3_q:0.0001 12  c-myc
control XLOC_000230_Change:9.4_q:0.003  43  cyclinD1
six XLOC_025437_Change:1.0_q:0.0008 54  cyclinD1
six-eya XLOC_001000_Change:4.3_q:0.02   34  cyclinD1
eya XLOC_000445_Change:9.9_q:0.3    42  atonal1
six XLOC_00099_Change:7.0_q:0.95    64  atonal1
control XLOC_0234324_Change:19.8_q:0.5  1   atonal1
six-eya XLOC_091345_Change:9.3_q:0.005  24  atonal1
eya XLOC_000115_Change:7.3_q:0.03   66  ezrin
six XLOC_000001_Change:7.9_q:0.00006    20  ezrin
six-eya XLOC_0234322_Change:9.0_q:0.0225    21  ezrin
six-eya XLOC_091345_Change:9.3_q:0.005  24  slc12a2
control XLOC_000440_Change:1.1_q:0.0012 26  sox2
eya XLOC_00010_Change:2.3_q:0.0002  65  sox2
six XLOC_000347_Change:8.3_q:0.5    76  sox2

以下子例程将提取每一行的信息并将其存储在 HoHoA 中:$hash{$gene}{$condition} = [$xloc, $q_value, $change, $percent_id ]

它还通过计算重复的数量并在基因名称前加上 [duplicate_number] 来满足基因多次出现的情况(例如基因'atonal1')。这意味着即使列表中重复的基因也可以输入哈希,因为它们是唯一的:

# Parse a tab-separated results file into the file-level %hash and return a
# reference to it.
#
# Each data line is expected to hold four tab-separated fields:
#   condition, "XLOC_<n>_Change:<x>_q:<y>", percent identity, gene
# and is stored as
#   $hash{$key}{$condition} = [ $xloc, $q_value, $change, $percent_id ];
# with the change value also recorded in $new{$key}{$condition}.
#
# A gene may appear in several separate runs of lines. The first run is stored
# under the plain gene name, later runs under "[1]gene", "[2]gene", ...
# A new run starts when the gene differs from the previous line's gene, or
# when the condition is already present in the run being filled. Numbering
# runs (rather than counting each gene/condition pair on its own) keeps all
# conditions of one run together even when runs list different conditions.
#
# Parameters: $file - path of the file to read.
# Returns:    reference to %hash.
# Dies:       if $file cannot be opened.
sub blast_extractor {
    my $file = shift;
    open my $input, '<', $file or die "Can't read file '$file' [$!]\n";

    my $previous_gene;    # gene named on the previous data line
    my $group_key;        # %hash key of the run currently being filled

    LINE:
    while ( my $line = <$input> ) {
        chomp $line;

        my ( $condition, undef, $percent_id, $gene ) = split /\t/, $line;
        next LINE unless defined $gene;    # blank or malformed line

        # List-context matches leave the variable undef when the pattern does
        # not match, avoiding the undefined "my $x = ... if ..." construct.
        my ($xloc)    = $line =~ /(XLOC_\d+)/;
        my ($change)  = $line =~ /Change:(-?\d+\.\d+|-?inf)/;
        my ($q_value) = $line =~ /q:(\d+\.\d+)/;

        # Still tally every gene/condition pair seen, as before; the tally is
        # no longer used to choose the key.
        $duplicates{$gene}{$condition}++;

        my $starts_new_group =
               !defined $previous_gene
            || $gene ne $previous_gene
            || exists $hash{$group_key}{$condition};

        if ($starts_new_group) {
            # Pick the first unused key: gene, [1]gene, [2]gene, ...
            my $copy = 0;
            $group_key = $gene;
            while ( exists $hash{$group_key} ) {
                $copy++;
                $group_key = "[$copy]$gene";
            }
        }
        $previous_gene = $gene;

        $hash{$group_key}{$condition} = [ $xloc, $q_value, $change, $percent_id ];
        $new{$group_key}{$condition}  = $change;
    }
    close $input;

    return \%hash;
}

# Print the contents of %hash for inspection; the output is shown below.
# NOTE(review): Dumper is presumably Data::Dumper's export - confirm it is loaded.
print Dumper \%hash;
$VAR1 = {
    '[1]atonal1' => {
        'eya' => [ 'XLOC_000445', '0.3', '9.9', '42' ],
        'six' => [ 'XLOC_00099', '0.95', '7.0', '64' ],
        'six-eya' => [ 'XLOC_091345', '0.005', '9.3', '24' ]
    },
    'atonal1' => {
         'control' => [ 'XLOC_0234324', '0.5', '19.8', '1' ],
         'eya' => [ 'XLOC_000445', '0.003', '10.3', '10' ],
         'six' => [ 'XLOC_00099', '0.095', '70.0', '30' ],
         'six-eya' => [ 'XLOC_0234324', '0.05',  '19.8', '40' ]
    },
    'c-myc' => {
        'eya' => [ 'XLOC_00010', '0.22', '6.5', '41' ],
        'six' => [ 'XLOC_025437', '0.018', '1.1', '22' ],
        'six-eya' => [ 'XLOC_001045', '0.0001', '2.3', '12' ]
    },
    'cyclinD1' => {
        'control' => [ 'XLOC_000230', '0.003', '9.4', '43' ],
        'six' => [ 'XLOC_025437', '0.0008', '1.0', '54' ],
        'six-eya' => [ 'XLOC_001000', '0.02', '4.3', '34' ]
    },
    'ezrin' => {
        'eya' => [ 'XLOC_000115', '0.03', '7.3', '66' ],
        'six' => [ 'XLOC_000001', '0.00006', '7.9', '20' ],
        'six-eya' => [ 'XLOC_0234322', '0.0225', '9.0', '21']
    },
    'slc12a2' => {
        'six-eya' => [ 'XLOC_091345', '0.005', '9.3', '24' ]
    },
    'sox2' => {
        'control' => [ 'XLOC_000440', '0.0012', '1.1', '26' ],
        'eya' => [ 'XLOC_00010', '0.0002', '2.3', '65'],
        'six' => [ 'XLOC_000347', '0.5', '8.3', '76' ]
    }
};

这会产生不稳定的输出: 问题 1

  • $count 统计的是每个 $duplicates{$gene}{$condition} 出现的次数,所以当同一基因各次重复出现时所含的条件数量不一致时就会出错,例如 atonal1。从输入数据中可以看到,atonal1 应该包含:

    'atonal1' => { 'eya' => ['XLOC_000445', '0.003', '10.3', '10' ], 'six' => ['XLOC_00099', '0.095', '70.0', '30' ], 'six-eya' => ['XLOC_0234324', '0.05', '19.8', '40'] };

而 [1]atonal1 应该包含:

'[1]atonal1' => {
    'eya' => ['XLOC_000445', '0.3',  '9.9', '42' ],
    'six' => ['XLOC_00099', '0.95',  '7.0', '64'],
    'six-eya' => ['XLOC_091345', '0.005', '9.3', '24'],
    'control' => ['XLOC_0234324', '0.5', '19.8', '1']
};

而在实际输出中,重复项之间的值混在了一起。

但是,我现在要做的是:只要某个基因的任意一个重复项在第二层键 'control' 下有值,就删除该基因(包括它的所有重复项)。在此示例中,我希望 Dumper 的输出如下所示:

$VAR1 = {
    'c-myc' => {
        'eya' => [ 'XLOC_00010', '0.22', '6.5', '41' ],
        'six' => [ 'XLOC_025437', '0.018', '1.1', '22' ],
        'six-eya' => [ 'XLOC_001045', '0.0001', '2.3', '12' ]
    },
    'ezrin' => {
        'eya' => [ 'XLOC_000115', '0.03', '7.3', '66' ],
        'six' => [ 'XLOC_000001', '0.00006', '7.9', '20' ],
        'six-eya' => [ 'XLOC_0234322', '0.0225', '9.0', '21' ]
    },
    'slc12a2' => {
        'six-eya' => [ 'XLOC_091345', '0.005', '9.3', '24' ]
    },
};
4

1 回答 1

1

我使用了您的数据结构,然后:

# Sample data: gene name => condition => four-element array reference.
# Keys of the form "[n]gene" hold a further copy of a gene that occurs more
# than once.
my $hash = {
    'atonal1' => {
        'control' => [ qw(XLOC_0234324 0.5   19.8 1)  ],
        'eya'     => [ qw(XLOC_000445  0.003 10.3 10) ],
        'six'     => [ qw(XLOC_00099   0.095 70.0 30) ],
        'six-eya' => [ qw(XLOC_0234324 0.05  19.8 40) ],
    },
    '[1]atonal1' => {
        'eya'     => [ qw(XLOC_000445 0.3   9.9 42) ],
        'six'     => [ qw(XLOC_00099  0.95  7.0 64) ],
        'six-eya' => [ qw(XLOC_091345 0.005 9.3 24) ],
    },
    'c-myc' => {
        'eya'     => [ qw(XLOC_00010  0.22   6.5 41) ],
        'six'     => [ qw(XLOC_025437 0.018  1.1 22) ],
        'six-eya' => [ qw(XLOC_001045 0.0001 2.3 12) ],
    },
    'cyclinD1' => {
        'control' => [ qw(XLOC_000230 0.003  9.4 43) ],
        'six'     => [ qw(XLOC_025437 0.0008 1.0 54) ],
        'six-eya' => [ qw(XLOC_001000 0.02   4.3 34) ],
    },
    'ezrin' => {
        'eya'     => [ qw(XLOC_000115  0.03    7.3 66) ],
        'six'     => [ qw(XLOC_000001  0.00006 7.9 20) ],
        'six-eya' => [ qw(XLOC_0234322 0.0225  9.0 21) ],
    },
    'slc12a2' => {
        'six-eya' => [ qw(XLOC_091345 0.005 9.3 24) ],
    },
    'sox2' => {
        'control' => [ qw(XLOC_000440 0.0012 1.1 26) ],
        'eya'     => [ qw(XLOC_00010  0.0002 2.3 65) ],
        'six'     => [ qw(XLOC_000347 0.5    8.3 76) ],
    },
};
# Remove every gene that has a 'control' entry in any of its copies, then
# print what is left.
#
# Further copies of a gene are stored under "[n]gene" keys, so names are
# compared after stripping that prefix. Comparing the stripped names exactly
# (instead of interpolating a key into a regex) means a 'control' entry found
# under "[1]gene" also removes "gene", and a gene whose name merely contains
# another gene's name is left alone.
my @illegal_genes;      # keys of %$hash that carry a 'control' entry
my %is_illegal_base;    # prefix-stripped gene name => 1
for my $gene (keys %$hash){
    next unless exists $hash->{$gene}{control};
    push @illegal_genes, $gene;
    (my $base = $gene) =~ s/\A\[\d+\]//;
    $is_illegal_base{$base} = 1;
}

# keys() returns a copy of the key list, so deleting while looping is safe.
for my $gene (keys %$hash){
    (my $base = $gene) =~ s/\A\[\d+\]//;
    delete $hash->{$gene} if $is_illegal_base{$base};
}
print Dumper($hash);

它打印您需要的结果。

于 2013-10-14T11:45:34.500 回答