perl入门06:常用字符串处理函数
2020-04-19 本文已影响0人
小贝学生信
1、 index
- 返回子字符串在字符串中最先出现的位置;
- 注意从0开始计算,可以理解为前面有多少字符;
- 找不到则返回-1;
- rindex 则返回子字符串在字符串中最后一次出现的位置(同样从0开始计算,找不到返回-1)。
#!/usr/bin/perl -w
# Demonstrates index/rindex: locating a substring inside a DNA sequence.
# Positions are 0-based; both functions return -1 when nothing is found.
use strict;
my $seq="TGAAACTCTACAATCTGAAAGATGCACAACGAGCAGGTAAGCTATGCGCAAGCCGTAACCCAGGGGTTAA";

# Each lookup gets its own variable. Re-declaring "my $where" several times
# in the same scope triggers '"my" variable $where masks earlier declaration'
# warnings under -w and hides every result except the last one.
my $first     = index($seq, "ATG");               # first occurrence
my $from_24th = index($seq, "ATG", 23);           # search starts at offset 23, i.e. the 24th base
my $second    = index($seq, "ATG", $first + 1);   # resume just past the first hit => second occurrence
my $last      = rindex($seq, "ATG");              # last occurrence

print "first ATG: $first\n";
print "first ATG at or after offset 23: $from_24th\n";
print "second ATG: $second\n";
print "last ATG: $last\n";
2、 substr
- 截取特定长度子字符串
#!/usr/bin/perl -w
# Demonstrates substr: extract the region that starts at the first "ATG"
# and ends just before the last "TAA".
use strict;
my $seq = "TGAAACTCTACAATCTGAAAGATGCACAACGAGCAGGTAAGCTATGCGCAAGCCGTAACCCAGGGGTTAA";

my $start = index($seq, "ATG");     # offset of the first ATG
my $stop  = rindex($seq, "TAA");    # offset of the last TAA

# substr arguments: source string, start offset, number of characters.
print substr($seq, $start, $stop - $start), "\n";
- substr 也有替换功能(可以作为左值,直接对截取的部分赋值)
# substr can be used as an lvalue: assigning to it replaces the selected
# region of the original string in place.
my $string="Hello,world!";
substr($string,0,5)="Goodbye";
# The first 5 characters ("Hello") are replaced, so $string is now "Goodbye,world!".
3、 排序
3.1、按照数值大小排序
- sort 默认按字符码值(即字符串顺序)排序,但实际情况下,我们更希望按数值大小排序;
- 加入飞碟操作符 <=> 即可:
sort {$a <=> $b} @array; #按照值,从小到大排序;
sort {$b <=> $a} @array; #按照值,从大到小排序;
#!/usr/bin/perl -w
# Demonstrates the difference between the default (string) sort and a
# numeric sort using the <=> operator.
use strict;

my @numbers = (1 .. 20);

my @as_strings = sort @numbers;                   # default: compared as strings
my @ascending  = sort { $a <=> $b } @numbers;     # numeric, smallest first
my @descending = sort { $b <=> $a } @numbers;     # numeric, largest first
# Equivalent descending form: reverse sort { $a <=> $b } @numbers

# One space-separated line per ordering.
print "$_\n" for ("@as_strings", "@ascending", "@descending");
![](https://img.haomeiwen.com/i20354525/2edacc4455d4ea56.png)
3.2、字母排序
- sort 默认字母排序:先大写后小写,再按字母表排序
- cmp 排序与 sort 默认行为一致,但可以加修饰(如 \L 转小写)实现不区分大小写的排序
#!/usr/bin/perl -w
# Demonstrates string sorting: the default sort, an explicit cmp sort, and a
# case-insensitive sort.
use strict;

my @letters = qw(A D k z E G F a k B c k z);

my @default_order    = sort @letters;
my @cmp_order        = sort { $a cmp $b } @letters;            # same ordering as the default
my @case_insensitive = sort { lc($a) cmp lc($b) } @letters;    # compare lower-cased copies

# One space-separated line per ordering.
print "$_\n" for ("@default_order", "@cmp_order", "@case_insensitive");
![](https://img.haomeiwen.com/i20354525/4e3844d49a6ca687.png)
3.3、哈希数据结构排序
- 前面说的都是对数组的排序。其实对哈希排序本质上也是对数组排序,键数组或者值数组。
- 这里以对成绩表(姓名--成绩)排序为例
#!/usr/bin/perl -w
# Demonstrates sorting a hash: once by key (name) and once by value (score).
use strict;

# Score sheet: name => score.
my %score=(
    "barney"=>95,
    "fred"=>92,
    "dino"=>67,
    "bamm"=>81,
    "tom"=>95,
    "kate"=>88,
    "bill"=>99,
);

# Sort by name (the hash key), in string order.
foreach my $name (sort { $a cmp $b } keys %score) {
    print "$name => $score{$name}\n";
}
print "\n";

# Sort by score (the hash value), highest first.
# "barney" and "tom" share the score 95; without a tie-break their relative
# order would follow the hash's internal key order, which is not stable
# between runs. "or $a cmp $b" falls back to the name when <=> returns 0,
# which makes the output reproducible.
foreach my $name (sort { $score{$b} <=> $score{$a} or $a cmp $b } keys %score) {
    print "$name => $score{$name}\n";
}
![](https://img.haomeiwen.com/i20354525/6a2502ab7a41350a.png)
4、 其它一些字符处理函数
- length ($seq); 计算字符串长度
- chomp ($seq); 去掉末尾的换行符
- chop ($seq); 去掉最后一个字符
- reverse ($seq); 字符串反转(需在标量上下文中使用,例如 my $rev = reverse ($seq);)
- lc ($seq); 将字符串变成小写
- uc ($seq); 将字符串变成大写
5、 生信综合练习例子
- 目的:核苷酸序列→氨基酸序列
- 思路:将核苷酸序列分成3个一组(密码子),与密码子表(哈希结构)匹配,输出对应的氨基酸。
#!/usr/bin/perl
# Translate every record of a FASTA nucleotide (CDS) file into an amino-acid
# sequence and write the result, again in FASTA layout, to a second file.
#
# Usage: perl SCRIPT <cds file> <pep file>
use strict;
use warnings;

# Both paths are required; checking only for "no arguments" would let a
# missing output path slip through to open().
if (@ARGV < 2) {
    die "This program is used to trans cds to pep
perl $0 <cds file> <pep file> \n";
}
my ($cds_file, $pep_file) = @ARGV;

# Three-argument open with lexical handles: the mode cannot be altered by
# characters in the file name, and a failure is reported instead of ignored.
open my $in,  '<', $cds_file or die "Cannot open $cds_file for reading: $!\n";
open my $out, '>', $pep_file or die "Cannot open $pep_file for writing: $!\n";

{
    # Use ">" as the input record separator so each read returns one FASTA
    # record. "local" restores the previous separator when this block ends.
    local $/ = ">";

    # The first read returns whatever precedes the first ">"; discard it.
    my $leading = <$in>;

    while (my $record = <$in>) {
        chomp $record;                  # removes the trailing ">" separator
        next if $record =~ /^\s*$/;     # skip empty or whitespace-only records

        # First line is the ID, everything after it is the sequence.
        my ($id, $dna) = split /\n/, $record, 2;
        $dna = '' unless defined $dna;  # header without any sequence lines
        $dna =~ s/\n//g;                # join the sequence onto a single line

        # Walk the sequence three bases at a time; an incomplete trailing
        # codon (1 or 2 bases) is left untranslated.
        my $protein = "";
        for (my $i = 0; $i < length($dna) - 2; $i += 3) {
            $protein .= codon2aa(substr($dna, $i, 3));
        }

        print {$out} ">$id\n";
        print {$out} "$protein\n";
    }
}

close $in  or die "Cannot close $cds_file: $!\n";
# Buffered write errors only surface at close, so check it.
close $out or die "Cannot close $pep_file: $!\n";

# The codon2aa helper called by the loop above is defined below.
# codon2aa($codon): translate one three-letter DNA codon into its one-letter
# amino-acid code.
#   $codon  - the codon; upper or lower case is accepted.
#   returns - the amino-acid letter, '' for a stop codon (TAA, TAG, TGA), or
#             "X" when the codon is not in the table (including undef).
#
# The table is built once instead of on every call. It lives in a BEGIN
# block so it is initialised at compile time: the code that calls codon2aa
# appears earlier in the file, so an ordinary run-time assignment placed
# here would not have executed yet when the first call is made.
BEGIN {
    my %genetic_code = (
        'TCA' => 'S',    # Serine
        'TCC' => 'S',    # Serine
        'TCG' => 'S',    # Serine
        'TCT' => 'S',    # Serine
        'TTC' => 'F',    # Phenylalanine
        'TTT' => 'F',    # Phenylalanine
        'TTA' => 'L',    # Leucine
        'TTG' => 'L',    # Leucine
        'TAC' => 'Y',    # Tyrosine
        'TAT' => 'Y',    # Tyrosine
        'TAA' => '',     # Stop
        'TAG' => '',     # Stop
        'TGC' => 'C',    # Cysteine
        'TGT' => 'C',    # Cysteine
        'TGA' => '',     # Stop
        'TGG' => 'W',    # Tryptophan
        'CTA' => 'L',    # Leucine
        'CTC' => 'L',    # Leucine
        'CTG' => 'L',    # Leucine
        'CTT' => 'L',    # Leucine
        'CCA' => 'P',    # Proline
        'CCC' => 'P',    # Proline
        'CCG' => 'P',    # Proline
        'CCT' => 'P',    # Proline
        'CAC' => 'H',    # Histidine
        'CAT' => 'H',    # Histidine
        'CAA' => 'Q',    # Glutamine
        'CAG' => 'Q',    # Glutamine
        'CGA' => 'R',    # Arginine
        'CGC' => 'R',    # Arginine
        'CGG' => 'R',    # Arginine
        'CGT' => 'R',    # Arginine
        'ATA' => 'I',    # Isoleucine
        'ATC' => 'I',    # Isoleucine
        'ATT' => 'I',    # Isoleucine
        'ATG' => 'M',    # Methionine
        'ACA' => 'T',    # Threonine
        'ACC' => 'T',    # Threonine
        'ACG' => 'T',    # Threonine
        'ACT' => 'T',    # Threonine
        'AAC' => 'N',    # Asparagine
        'AAT' => 'N',    # Asparagine
        'AAA' => 'K',    # Lysine
        'AAG' => 'K',    # Lysine
        'AGC' => 'S',    # Serine
        'AGT' => 'S',    # Serine
        'AGA' => 'R',    # Arginine
        'AGG' => 'R',    # Arginine
        'GTA' => 'V',    # Valine
        'GTC' => 'V',    # Valine
        'GTG' => 'V',    # Valine
        'GTT' => 'V',    # Valine
        'GCA' => 'A',    # Alanine
        'GCC' => 'A',    # Alanine
        'GCG' => 'A',    # Alanine
        'GCT' => 'A',    # Alanine
        'GAC' => 'D',    # Aspartic Acid
        'GAT' => 'D',    # Aspartic Acid
        'GAA' => 'E',    # Glutamic Acid
        'GAG' => 'E',    # Glutamic Acid
        'GGA' => 'G',    # Glycine
        'GGC' => 'G',    # Glycine
        'GGG' => 'G',    # Glycine
        'GGT' => 'G',    # Glycine
    );

    sub codon2aa {
        my ($codon) = @_;

        # A missing argument cannot match any codon; answer "X" without
        # tripping an "uninitialized value" warning in uc().
        return "X" unless defined $codon;

        $codon = uc $codon;    # the table keys are upper case

        # "X" marks anything that is not one of the 64 codons in the table.
        return exists $genetic_code{$codon} ? $genetic_code{$codon} : "X";
    }
}
perl 1.pl gene.ffn gene.pep
head gene.pep
![](https://img.haomeiwen.com/i20354525/112a3c2791f10104.png)