Perl Tutorial - Practical Extraction and Reporting Language (Perl)
Please leave a remark at the bottom of each page with your useful suggestion.
Table of Contents
- Perl Introduction
- Perl Program Startup
- Perl Regular Expressions
- Perl Array Program
- Perl Basic Program
- Perl Subroutine / Function Program
- Perl XML Program
- Perl String Program
- Perl Statement Program
- Perl Network Program
- Perl Hash Program
- Perl File Handling Program
- Perl Data Type Program
- Perl Database Program
- Perl Class Program
- Perl CGI Program
- Perl GUI Program
- Perl Report Program
Perl Regular Expressions
$1 gets 'A', $2 gets 'B'
$str = "A and B";
$str =~ s/(A) and (B)/$2, $1/i;
print $str;
A greedy quantifier
$string="this is a test. this is another test";
$string =~ s/is.*/is not/;
print "$string\n";
# Turning off greed
$string="this is a test. this is another test";
$string =~ s/is.*?/is not/;
print "$string\n";
Alternation: /John|Karen|Steve/ will match a line containing John or Karen or Steve.
# Alternation: this, that, and the other thing
while(<DATA>){
print if /Mary|Betty|Jon/;
}
__DATA__
Mary
Betty Boop
Mark
A negative look ahead
while(<DATA>){
print if/^\w+\s(?![BC])/;
}
__DATA__
ABC
CBC
A negative look behind
while(<DATA>){
print if /(?<!B) B[a-z]*/;
}
__DATA__
Betty
CBC
ABC
A positive look ahead
$string="I love chocolate.";
$string =~ s/chocolate(?= ice)/vanilla/;
print "$string\n";
$string="this is a test.";
$string =~ s/this(?=is)/test/g;
print "$string\n";
A positive look behind
$string="chocolate/cake/milk/ice cream.";
$string =~ s/(?<= chocolate) milk/ candy/;
print "$string\n";
$string="this is a test.";
$string =~ s/(?<=a test) is/ is not/;
print "$string\n";
A program that illustrates the use of the matching operator.
#!/usr/local/bin/perl
print ("Ask me a question politely:\n");
$question = <STDIN>;
if ($question =~ /please/) {
print ("Thank you for being polite!\n");
} else {
print ("That was not very polite!\n");
}
A program that loops using a pattern.
#!/usr/local/bin/perl
while ("balata" =~ /.a/g) {
$match = $&;
print ("$match\n");
}
A simple integer-validation program.
#!/usr/local/bin/perl
# Read one line from STDIN and report whether it is a legal integer:
# an optionally negative decimal number, or an optionally negative
# hexadecimal number written with a 0x / 0X prefix.
print ("Enter a number:\n");
$number = <STDIN>;
# chomp removes only a trailing newline; chop would delete the last digit
# when the input ends without one.
chomp ($number);
# The hex digit class must read a-fA-F. The original "a-F" is a descending
# range, which Perl rejects when compiling the pattern.
if ($number =~ /^-?\d+$|^-?0[xX][\da-fA-F]+$/) {
    print ("$number is a legal integer.\n");
} else {
    print ("$number is not a legal integer.\n");
}
@a = split (//, $s);
$s = "Hello";
@a = split (//, $s);
print "@a\n";
A word-count program that handles multiple spaces and tabs between words.
#!/usr/local/bin/perl
# Count the words read from STDIN. Words are separated by any run of
# spaces and/or tabs; the total is printed once input is exhausted.
$wordcount = 0;
# Loop on definedness so that only end-of-file terminates the loop.
while (defined ($line = <STDIN>)) {
    # chomp strips just the newline; chop would destroy a one-character
    # final line that has no trailing newline.
    chomp ($line);
    # A line that starts with whitespace makes split return an empty first
    # field; discard empty fields so they are not counted as words.
    @words = grep { length } split(/[\t ]+/, $line);
    $wordcount += @words;
}
print ("Total number of words: $wordcount\n");
Back referencing
while(<DATA>){
($first, $last)=/(\w+) (\w+)/; # Could be: (\S+) (\S+)/
print "$last, $first\n";
}
__DATA__
First Last
Backreferencing and greed
$fruit="apples pears peaches plums";
$fruit =~ /(.*)\s(.*)\s(.*)/;
print "$1\n";
print "$2\n";
print "$3\n";
print "-" x 30, "\n";
$fruit="apples books";
$fruit =~ /(.*?)\s(.*?)\s(.*?)\s/; # Turn off greedy quantifier
print "$1\n";
print "$2\n";
print "$3\n";
Backreferencing and greedy quantifiers
# Greedy capture: (.*C) grabs everything up to the LAST "C" in the string,
# and (.*) takes whatever is left.
# The sample string must contain at least one "C"; without it the
# substitution fails and $1 / $2 are never set.
$string="ABCdefghiCxyzwerC YOU!";
$string=~s/(.*C)(.*)/HEY/; # Substitute the whole string with HEY
print $1, "\n";   # ABCdefghiCxyzwerC
print $2, "\n";   # " YOU!"
print "$string\n"; # HEY
Beginning and end of word anchors
while(<DATA>){
print if /\bJon\b/;
}
__DATA__
Jonathan
Jason
Mary
Mark
Beginning of line anchor
while(<DATA>){
print if /^[JK]/;
}
__DATA__
Mark
Mary
Jack
Kate
\B matches only if the pattern is contained in a word.
/\Bdef/ matches abcdef, but not def.
/def\B/ matches defghi, but not def.
/\Bdef\B/ matches cdefg or abcdefghi, but not def, defghi, or abcdef.
Capitalize all sentences
#!/usr/bin/perl
# Demonstrates capitalising the first letter of every sentence.
# (The shebang needs its leading slash: "#!usr/bin/perl" is a relative path.)
use strict;
use warnings;

my $string1 = "lets see. This is a test";
capitalize( $string1 );

# capitalize( $text )
# Prints $text unchanged, then prints a copy in which the first lowercase
# letter at the start of the string, or after '.', '!' or '?' plus optional
# whitespace, has been upper-cased. The caller's variable is not modified.
sub capitalize
{
    my $string = shift();
    print "$string\n";
    # $1 = sentence terminator (or start of string) plus following spaces,
    # $3 = the lowercase letter; \u upper-cases just that one character.
    $string =~ s/(([.!?]|\A)\s*)([a-z])/$1\u$3/g;
    print "$string\n";
}
Changing substitution delimiters
#!/usr/bin/perl -w
$path = "/usr/bin";
# Change /usr/bin to /usr/local/bin.
$path =~ s{/usr/bin} </usr/local/bin>;
print "New path: $path.\n";
Character Class: Alternative Characters
Metacharacter What It Matches
was|were|will Matches one of was, were, or will
Check beginning
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check beginning.
if ($patt =~ /^perl/) {
print "\tFound perl at start in \"$patt\".\n";
}
}
Check beginning and end
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check beginning and end.
if ($patt =~ /^perl$/) {
print "\tFound only perl in \"$patt\".\n";
}
}
Check end
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check end.
if ($patt =~ /perl$/) {
print "\tFound perl at end in \"$patt\".\n";
}
}
Check for 4-8 a's
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for 4-8 a's.
if ($patt =~ /a{4,8}/) {
print "\tFound 4-8 a's in \"$patt\".\n";
}
}
Check for 'a'
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for a.
if ($patt =~ /a/) {
print "\tFound an a in \"$patt\".\n";
}
}
Check for and remove a trailing backslash character
#!/usr/bin/perl
# Read lines from the files named on the command line (or STDIN), join any
# line that ends in a backslash with the line that follows it, and print
# the resulting logical lines with their index.
use warnings;
use strict;
my @lines = ();
while (<>) {
chomp;
# s/\\$// both tests for and removes a trailing backslash.
if (s/\\$//) {
my $line = <>;
# Append the continuation, then redo: the loop body restarts WITHOUT
# reading a new line, so the joined text is chomped and checked again
# (which handles several continuation lines in a row).
# NOTE(review): $line is undef if input ends right after a backslash;
# the concatenation then emits an "uninitialized" warning.
$_.= $line, redo;
}
push @lines, $_;
}
foreach (0..$#lines) {
print "$_ : $lines[$_] \n";
}
Check for a or b
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for a or b.
if ($patt =~ /a|b/) {
print "\tFound an a or b in \"$patt\".\n";
}
}
Check for digit
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for digit.
if ($patt =~ /\d/) {
print "\tFound a digit in \"$patt\".\n";
}
}
Check for leading alpha character, rest alphanumeric
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for leading alpha character, rest alphanumeric.
if ($patt =~ /^[a-zA-Z]+[_0-9a-zA-Z]*$/) {
print "\tLeading alpha, then alphanumeric in \"$patt\".\n";
}
}
Check for no 'a'
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for no a.
if ($patt !~ /a/) {
print "\tNO a in \"$patt\".\n";
}
}
Check for a non-digit character
#!/usr/bin/perl -w
# Report every command-line argument that contains at least one
# character that is not a digit.
foreach $patt (@ARGV) {
# \D matches any single non-digit, so "a1" matches even though it also
# contains a digit; this is not a test for "contains no digits at all".
if ($patt =~ /\D/) {
print "\tFound non-digit in \"$patt\".\n";
}
}
Check for white space
#!/usr/bin/perl -w
foreach $patt (@ARGV) {
# Check for white space.
if ($patt =~ /\s/) {
print "\tFound white space in \"$patt\".\n";
}
}
Check the frequency
#!/usr/bin/perl
use warnings;
use strict;
# frequency( @strings )
# Treats all arguments as one concatenated text and returns a hash
# (as a flat key/value list) mapping each character to its occurrence count.
sub frequency {
    my %tally;
    # Splitting each argument into characters visits exactly the same
    # characters as joining the arguments first.
    $tally{$_} += 1 for map { split //, $_ } @_;
    return %tally;
}
# Tally the characters of a sample sentence and print one line per
# character, e.g. "1 'a'" for a single occurrence or "3 's's" for several.
my $text  = "this is a test";
my %count = frequency($text);
for my $char (sort keys %count) {
    my $times  = $count{$char};
    # A lone occurrence just closes the quote; otherwise add a plural 's'.
    my $ending = ($times == 1) ? "'" : "'s";
    print "\t", $times, " '$char", $ending, "\n";
}
Clustering and anchors
while(<DATA>){
# print if /^A|B/;
print if /^(S|B)/;
}
__DATA__
S B
B B
A C
N C
J D
K E
Clustering or grouping
while(<DATA>){
print if /\s(12){3}$/;
# match exactly 3 consecutive occurrences of 12 at the end of the line
}
__DATA__
Mary
Betty Boop
Mark 123
123 123 123
Combine [] with + to match a sequence of characters of any length.
/d[eE]+f/
This matches all of the following:
def
dEf
deef
dEef
dEEEeeeEef
Counting using tr.
#!/usr/local/bin/perl
# Count how many times each distinct character of $countstring occurs in
# the input (files named on the command line, or STDIN).
$countstring = "test";
# Keep each character once. "test" contains 't' twice, which would make the
# loop below add t's count twice per line and print its total twice.
%seen = ();
@chars = grep { !$seen{$_}++ } split (/\s*/, $countstring);
# Start every tally at zero so empty input still prints a number.
foreach $char (@chars) {
    $count{$char} = 0;
}
while ($input = <>) {
    $_ = $input;
    foreach $char (@chars) {
        # tr/// does not interpolate variables, hence the string eval.
        # quotemeta keeps characters such as '/', '-' or '\' from breaking
        # (or altering) the generated tr/// expression.
        $safe = quotemeta ($char);
        eval ("\$count = tr/$safe/$safe/;");
        $count{$char} += $count;
    }
}
foreach $char (sort (@chars)) {
    print ("$char appears $count{$char} times\n");
}
Count the match times
# Number every name that follows a "Name:" label in $text.
$text = "Name: Anne Name: A Name: C Name: D";
$match = 0;
# In list context a /g match returns all captured names at once.
foreach my $name ($text =~ /Name: *(\w+)/g) {
    $match += 1;
    print "Match number $match is $name.\n";
}
/de{1,3}f/ matches d, followed by one, two, or three occurrences of e, followed by f.
To specify an exact number of occurrences, include only one value between the { and the }.
/de{3}f/ specifies exactly three occurrences of e, which means
this pattern only matches deeef.
Defaults to split(' ', $_).
#!/usr/bin/perl
# split with no arguments splits $_ on whitespace, like split(' ', $_).
# (The shebang needs its leading slash: "#!usr/bin/perl" is a relative path.)
use warnings;
use strict;

$_ = "Separated by whitespace\n";
# Print one word per line.
print "$_\n" foreach (split);
Determine if a string has a digit.
# Test two sample strings for the presence of a digit.
use strict;
use warnings;

# $string must be declared with "my"; under "use strict" an undeclared
# variable is a compile-time error.
my $string = "hello there";
if ( $string =~ /[0-9]/ ) {
    print "'$string' has a digit.\n";
}
else {
    print "'$string' has no digit.\n";
}

$string = "this one has a 2";
if ( $string =~ /[0-9]/ ) {
    print "'$string' has a digit.\n";
}
else {
    print "'$string' has no digit.\n";
}
/d.f/ matches d, followed by any non-newline character, followed by f
The . character is often used in conjunction with the * character.
/d.*f/ matches any string that contains the character d preceding the character f
Divide according to multiple separators.
#!/usr/bin/perl
# Split a string on any one of several separator characters.
# (The shebang needs its leading slash: "#!usr/bin/perl" is a relative path.)
use warnings;
use strict;

my $string = "\nThis-is:a\@test-for,*you.";
# The character class lists the separators: '-', ':', '@' and '*'.
# The comma is not in the class, so "for," stays in one piece.
my @words = split( /[-:@*]/, $string );
print "$_\n" foreach ( @words );
Divide it into a specific number of fields.
#!/usr/bin/perl
# Split a string into at most three fields.
# (The shebang needs its leading slash: "#!usr/bin/perl" is a relative path.)
use warnings;
use strict;

my $string = "this is a test.";
# The third argument caps the number of fields; the last field keeps the
# unsplit remainder ("a test.").
my @words = split(/ /, $string, 3);
print "\n";
print "$_\n" foreach ( @words );
ei modifier
# /e evaluates the replacement as Perl code; /i ignores case in the match.
$_ = "knock knock, who is there.\n";
# The first "knock" is replaced by the value of the expression
# "knock, " x 2 . "knocking", i.e. "knock, knock, knocking".
s/knock/"knock, " x 2 . "knocking"/ei;
# The closing quote was missing, leaving the string literal unterminated.
print "He's $_";
End of line anchor
while(<DATA>){
print if /10$/;
}
__DATA__
1.10
.5
555.10
4.01
.501
601
Escape Sequences for Special Characters
To include a backslash in a pattern, specify two backslashes:
/\\+/
This pattern tests for one or more occurrences of \ in a string.
Escape sequences, \n and \t
while(<DATA>){
print if s/\n/\t/;
}
__DATA__
AA AA AQA 101
Evaluate replacement
# Replace "5" with the value of the expression 6*7 (the /e modifier).
$str = "He gave me 5 dollars.\n";
# Bind the substitution to $str; a bare s/// acts on $_ and would leave
# $str untouched.
$str =~ s/5/6*7/e;
print $str;   # He gave me 42 dollars.
Get all numbers
#!/usr/bin/perl
use warnings;
my $text = "3 A, 2 B, and 0 C";
$text =~ s/\b(\d+)\b/$1 > 0?$1 > 1?$1 > 2?
"Several":"A pair of"
:"One":"No"/ge;
print $text, "\n";
Get begin of the line
# Mark the beginning and end of every line in a multi-line string.
$_ = "This text\nhas multiple lines.";
# /m makes ^ and $ match at each embedded line boundary; without it they
# match only at the start and end of the whole string, so /g has nothing
# further to find.
s/^/BOL/mg;
s/$/EOL/mg;
print;
Greedy and non-greedy quantifiers
#!/usr/bin/perl
# Contrast a greedy quantifier (.*) with a non-greedy one (.*?).
# (The shebang needs its leading slash: "#!usr/bin/perl" is a relative path.)
use strict;
use warnings;

# The sample needs an "N ... here." sequence more than once; otherwise
# neither substitution matches and the two results cannot differ.
my $string1 = "Not here. Not there. Nowhere.";
my $string2 = $string1;

print "$string1\n";
# Greedy: .* runs to the LAST "here.", so the whole string is removed.
$string1 =~ s/N.*here\.//;
print "$string1\n";

print "$string2\n";
# Non-greedy: .*? stops at the FIRST "here.", removing only "Not here.".
$string2 =~ s/N.*?here\.//;
print "$string2\n\n";
Greedy Matches
$p="pattern";
#A greedy match matches the entire pattern
$p =~ /(.*)/;
print "1==>$1<==\n";
# grep aliases $_ to each element, so modifying $_ inside the block
# changes @array1 itself.
@array1 = (1, 1, 1, 1);
@array2 = grep {$_ *= 2} @array1;
# A single element is written $array1[1]; @array1[1] is a one-element
# slice and draws a warning under -w.
print $array1[1];   # 2
grep function with regular expression
@a = qw(This is a test);
@b = grep/^\w{4}/, @a;
print "@b";
grep value
@a = (1 .. 10);
@b = grep {$_ > 5} @a;
print join(", ", @b);
grep with regular expression
print join(" ",(grep {!/^\w{4}$/} (qw(Here are some four letter words.))));
if ($text =~ /^[+-]\d+\.\d*$/) {print "It's a number.";}
$text = "-3.1415";
if ($text =~ /^[+-]\d+\.\d*$/)
{print "It's a number.";}
if ($text =~ /(\d+)/) {print "Here's the number of apples: $1.\n";}
$text = "I have 4 apples.";
if ($text =~ /(\d+)/)
{print "Here's the number of apples: $1.\n";}
if ($text =~ /^\d+$/) {print "It's a number.";}
$text = "345";
if ($text =~ /^\d+$/)
{print "It's a number.";}
if ($text =~ /\D/) {print "It's not a number.";}
$text = "Hello!";
if ($text =~ /\D/)
{print "It's not a number.";}
Ignore case, global substitution
# Replace every "blue", in any letter case, with "upbeat".
$str="BLue, BLUE...";
# The binding operator is "=~" with no space. "= ~" assigns the bitwise
# complement of a substitution performed on $_.
$str =~ s/blue/upbeat/ig;
print $str;   # upbeat, upbeat...
Increase salary with pattern match
$salary=50000;
$salary =~ s/$salary/$& * 1.1/e;
print "\$& is $&\n";
print "The salary is now \$$salary.\n";
Inline Modifiers
#!/usr/bin/perl
use warnings;
use strict;
my $string = "There's more than One Way to do it!";
print "Enter a test expression: ";
my $pat = <STDIN>;
chomp($pat);
if ($string =~ /$pat/) {
print "'$pat' matches the sample string.\n";
} else {
print "No match found for '$pat'";
}
Introducing the translation operator
#!/usr/bin/perl
# Introduces tr///: map each lowercase letter to the character at the same
# position in the replacement list.
# (The shebang needs its leading slash: "#!usr/bin/perl" is a relative path.)
use warnings;
use strict;

my $string = "this is a test";
$_ = "hello, abc";
print "$string\n";
print "$_\n\n";

# Bound with =~, tr/// works on $string ...
$string =~ tr/a-z/0123456789.,';)(*&^%#@!/;
# ... and with no binding it works on $_.
tr/a-z/0123456789.,';)(*&^%#@!/;

print "$string\n";
print "$_\n";
Iterate over matches with foreach and $_ and nested foreach
#!/usr/bin/perl
use warnings;
use strict;
my $text = "one, two, three, four";
foreach ($text =~ /\b(\w+)\b/g) {
print "outer: got: $_, matched: $&, extracted: $1 \n";
foreach (/(\w)/g) {
print "\tinner: got: $_, matched $&, extracted $1 \n";
}
}
Iterate over matches with foreach and $_ and nested while loop
#!/usr/bin/perl
use warnings;
use strict;
my $text = "one, two, three, four";
while ($text =~ /\b(\w+)\b/g) {
print "outer: matched: $&, extracted: $1 \n";
while ($1 =~ /(\w)/g) {
print "\tinner: matched $&, extracted $1 \n";
}
}
Iterate over matches with while and $1
#!/usr/bin/perl
use warnings;
use strict;
my $text = "one, two, three, four";
while ($text =~ /\b(\w+)\b/g) {
print $1, "\n";
}
Match any number of alphanumerics followed by a single space character
$p = "This is a pattern test.";
if ($p =~ /(\w*\s)/){ print "$1\n"; }
Match at least one alphanumeric followed by a single space character
if ($p =~ /(\w+\s)/){ print "$1\n"; }
Match case
#!/usr/bin/perl -w
use strict;
$_ = "Case";
if (/case/) {
print "found\n";
} else {
print "not found\n";
}
Matched index
#!/usr/bin/perl
use warnings;
use strict;
my $text= "Testing";
if ($text =~ /((?:T|N)est(ing|er))/) {
print " \$1 = $1 \n \$2 = $2 \n \$3 = $3 \n \$4 = $4 \n";
}
Matching Any Letter or Number
range [a-z] matches any lowercase letter
range [A-Z] matches any uppercase letter
/[A-Z][A-Z]/ matches any two uppercase letters.
To match any uppercase letter, lowercase letter, or digit,
use the following range: /[0-9a-zA-Z]/
Matching Modifiers
#Modifier Meaning
#i Turn off case sensitivity.
#m Treat a string as multiple lines.
#o Compile pattern only once.
#s Treat string as a single line when a newline is embedded.
#x Permit comments in a regular expression and ignore whitespace.
#g Match globally; i.e., find all occurrences.
while(<DATA>){
print if /B/;
# Print the line if it matches B
}
__DATA__
S
B
I
N
J
K
Match multiline patterns?
#!/usr/local/bin/perl -w
use strict;
$/ = ""; # Paragraph mode
while(<>) {
print $1 if /(lines.*\n.*spaces)/s;
}
Match numbers
#!/usr/bin/perl
use warnings;
use strict;
my $text = "One Two Three 456 Seven";
while ($text =~ /[0-9]+/g) {
print " \$& = $& \n \$` = $` \n \$' = $' \n";
}
Match one or zero alphanumerics followed by a single space character
$p = "This is a pattern test.";
if ($p =~ /(\w?\s)/){ print "$1\n"; }
if ($p =~ /(\w\s)/){ print "$1\n"; }
Match the first alphanumeric character
$p = "This is a pattern test.";
if ($p =~ /(\w)/){ print "$1\n"; }
Match the new line character
# Tag the start and end of each line around the embedded newline.
$_ = "This text\nhas multiple lines.";
# Without /m, ^ and $ anchor only to the whole string and the embedded
# newline is never seen as a line boundary.
s/^/BOL/mg;
s/$/EOL/mg;
print;
Match the one or more alphanumerics.
$p = "This is a pattern test.";
if ($p =~ /(\w)*/){ print "$1\n"; }
if ($p =~ /(\w)+/){ print "$1\n"; }
Match zero or one characters
$p = "This is a pattern test.";
if ($p =~ /(\w)?/){ print "$1\n"; }
Meta-Character ASCII Values
Code Meaning
\a Alarm
\b Word boundary
\B Not word boundary
\d Digit
\D Not digit
\e Escape
\f Form feed
\n Newline
\r Carriage return
\s Space character (space, \t, \n, \r, \f)
\S Not space character
\t Tab
\w Word
\W Not word
\0NN Octal (also written \o{NN})
\xNN Hexadecimal
\cC Control character
Metacharacters and metasymbols
while(<DATA>){
print if /[ABC]\D/
}
__DATA__
Steve
Mary
Abort
Jack
Metacharacters for Single Characters
Metacharacter What It Matches
. Matches any character except a newline
[a-z0-9_] Matches any single character in set
[^a-z0-9_] Matches any single character not in set
\d Matches a single digit
\D Matches a single nondigit; same as [^0-9]
\w Matches a single alphanumeric (word) character; same as [a-zA-Z0-9_]
\W Matches a single nonalphanumeric (nonword) character; same as [^a-zA-Z0-9_]
Meta-Characters listing
Character Meaning
\ Escape, do not interpret the following meta-character
| OR, match either of the alternatives
( ) Create a single expression or atom
{ } Define the minimum and/or maximum repetitions of an atom
* Match an atom zero or more times
+ Match an atom one or more times
? Match an atom zero or one times
^ Match an atom at the start of the string
$ Match an atom at the end of the string
[ ] Match one of the enclosed atoms
. Match any character
\A Alternative to meta-character ^
\Z Alternative to meta-character $
Metacharacters that Turn off Greediness
# Greedy and not greedy
$_="abcdefghijklmnopqrstuvwxyz";
s/[a-z]+/XXX/;
print $_, "\n";
$_="abcdefghijklmnopqrstuvwxyz";
s/[a-z]+?/XXX/;
print $_, "\n";
Metasymbols and subpatterns
while(<DATA>){
s/(\w+)\s(\w+)/$2, $1/; # Reverse first and last names
print;
}
__DATA__
First Last
Multiplies every integer in a file by 2
#!/usr/local/bin/perl
# Multiply every integer in each file named on the command line by 2,
# rewriting the file in place.
foreach $filename (@ARGV) {
    # Three-argument open keeps characters in the file name from being
    # read as an open mode or a pipe.
    unless (open (FILE, '<', $filename)) {
        warn ("Cannot read $filename: $!\n");
        # Skip the file; carrying on would reach the write-mode open
        # below and truncate or create it.
        next;
    }
    @file = <FILE>;
    close (FILE);

    # $line aliases each array element, so the edit happens in place.
    foreach $line (@file) {
        $line =~ s/\d+/$& * 2/eg;
    }

    unless (open (FILE, '>', $filename)) {
        warn ("Cannot write $filename: $!\n");
        next;
    }
    print FILE (@file);
    # Buffered write errors surface only when the handle is closed.
    close (FILE) or warn ("Error closing $filename: $!\n");
}
Occurrences of planet were changed to world
use strict;
use warnings;
my $string = "This planet is our planet.";
print "$string\n";
my $matches = $string =~ s/planet/world/g;
print "$matches occurences of planet were changed to world.\n";
print "The new string is: $string\n";
Options for the S Operator
Option Meaning
s/pattern/substitution/g Replace all matches of pattern.
s/pattern/substitution/i Ignore case in matching.
s/pattern/substitution/e Evaluate substitution as an expression.
s/pattern/substitution/o Compile expression only once.
s/pattern/substitution/m Allow ^ to match after every newline and
$ to match before every newline in the string being searched.
s/pattern/substitution/s Allows a period to match a newline.
Options for the substitution operator.
Option Description
g Change all occurrences of the pattern
i Ignore case in pattern
e Evaluate replacement string as expression
m Treat string to be matched as multiple lines
o Evaluate only once
s Treat string to be matched as single line
x Ignore white space in pattern
Options for the translation operator.
Option Description
c Translate all characters not specified
d Delete all specified characters
s Replace multiple identical output characters with a single character
Parentheses and regex
#!/usr/bin/perl
use warnings;
use strict;
my $text= "Testing";
if ($text =~ /((T|N)est(ing|er))/) {
print " \$1 = $1 \n \$2 = $2 \n \$3 = $3 \n \$4 = $4 \n";
}
Pattern anchors in Perl.
Anchor Description
^ or \A Match at beginning of string only
$ or \Z Match at end of string only
\b Match on word boundary
\B Match inside word
Pattern array
# Test every input line against a list of precompiled patterns.
@patterns =
(
    qr/\bis\b/,
    qr/\ba\b/,
    qr/\bnone\b/,
);
while (<>) {
    # $#patterns is the LAST valid index, so the range must include it;
    # the original "< $#patterns" never tested the final pattern.
    foreach $loop_index (0 .. $#patterns) {
        if (/$patterns[$loop_index]/) {
            print "Matched pattern $loop_index!\n";
        }
        else {
            print "Didn't match pattern $loop_index.\n";
        }
    }
}
Pattern match
# Show the special variables set by a successful match.
$str="old and restless";
print "$&\n" if $str =~ /and/;   # the matched text: "and"
# $` (backtick) is the text BEFORE the match. The original printed $'
# twice and never showed the prematch.
print "$`\n" if $str =~ /and/;   # "old "
print "$'\n" if $str =~ /and/;   # the text after the match: " restless"
print "\nold string is: $str\n";
$str=~s/(old) and (restless)/$2 and $1/;
print "new string is: $str\n";
# $+ holds the text of the last capture group that matched.
print "\nlast pattern matched: $+\n";
Pattern Matching Operators
#Example Meaning
#$name =~ /Tom/ True if $name contains pattern.
# Returns 1 for true, null for false.
#$name !~ /Jack/ True if $name does not contain pattern.
#$name =~ s/Jack/Sam/ Replace first occurrence of Jack with Sam.
#$name =~ s/Jack/Sam/g Replace all occurrences of Jack with Sam.
#$name =~ tr/a-z/A-Z/ Translate all lowercase letters to uppercase.
#$name =~ /$pal/ A variable can be used in the search string.
# Using the $_ scalar explicitly
# Read the DATA section line by line and print each line that contains "I".
while($_=<DATA>){
print $_ if $_ =~ /I/; # $_ holds the current input line
# Equivalent shorthand relying on the implicit $_:
# print if /I/;
}
__DATA__
S
B
I
N
J
K
Pattern-Matching Operators(The syntax used to perform a pattern match on a string)
# The syntax used to perform a pattern match on a string is $string =~ /regular expression/expression modifier (optional)
#!/usr/local/bin/perl -w
while (<STDIN>)
{
print if ($_ =~ /the/);
}
Pattern-matching options.
Option Description
g Match all possible patterns
i Ignore case
m Treat string as multiple lines
o Only evaluate once
s Treat string as single line
x Ignore white space in pattern
Pattern Modifiers
Code Description
g Global: match all occurrences of the regular expression
i Ignore case: match any case
m Multiple lines: process the input as multiple lines
o Only once: compile the regular expression the first time
s Single line: ignore new lines
x Extra spaces: allow comments and spaces in regular expression syntax
Patterns containing + always try to match as many characters as possible.
For example, if the pattern /ab+/ is searching in the string
abbc
it matches abb, not ab.
Pattern Tester
#!/usr/bin/perl
# Prompt for a pattern and report whether it matches a fixed sample text.
use warnings;
use strict;

$_ = q("'yes' or 'no',".);
print "Enter some text to find: ";
# Read the pattern the prompt asks for; the original hard-coded "test"
# and never consulted STDIN.
my $pattern = <STDIN>;
die "No pattern entered.\n" unless defined $pattern;
chomp($pattern);
# The input is used as a regular expression on purpose: this script is a
# pattern tester, so metacharacters keep their meaning.
if (/$pattern/) {
    print "The text matches the pattern '$pattern'.\n";
} else {
    print "'$pattern' was not found.\n";
}
Posix and Unicode Classes
Shortcut Expansion Description
[[:alpha:]] [a-zA-Z] An alphabetic character.
[[:alnum:]] [0-9A-Za-z] An alphabetic or numeric character.
[[:digit:]] \d A digit, 0-9.
[[:lower:]] [a-z] A lower case letter.
[[:upper:]] [A-Z] An upper case letter.
[[:punct:]] [!"#$%&'()*+,-./:;<=>?@\[\\\]^_`{|}~] A punctuation character note the escaped characters [, \, and ].
/bea?t/ Matches either 'beat' or 'bet'
/bea+t/ Matches 'beat', 'beaat', 'beaaat'
/bea*t/ Matches 'bet', 'beat', 'beaat'
Print line unless it matches E
while(<DATA>){
print unless /E/; # Print line unless it matches E
}
__DATA__
S
B
I
N
J
E
Regex index
$text = "no and yes";
$text =~ s/(\w+) (\w+) (\w+)/$3 $2 $1/;
print $text;
Regular expression character patterns
SEQUENCE PURPOSE
\w Matches an alphanumeric character. Alphanumeric includes '_'.
\W Matches a nonalphanumeric character.
\s Matches a whitespace character. This includes spaces and tabs.
\S Matches a nonwhitespace character.
\d Matches a digit.
\D Matches a nondigit character.
\b Matches a word boundary.
\B Matches a nonword boundary.
\A Matches only at beginning of string.
\Z Matches only at end of string.
\G Matches only where previous m//g left off.
Regular expression metacharacters
METACHARACTER PURPOSE
\ accept as a regular character;
^ Matches the beginning of the string, unless /m is used to modify the expression.
. Matches any character except a new-line character, unless /s is used to modify the expression.
$ Matches the end of the string, unless /m is used to modify the expression.
| Expresses alternation.
( ) Groups expressions to assist in alternation and back referencing.
[ ] Looks for a set of characters.
Regular expression modifiers
MODIFIER NAME PURPOSE
i Makes the search case-insensitive.
m Treat this line as a multiple line.
s . matches any character, including a new line.
x Allows whitespace in the expression.
Regular expression pattern quantifiers
QUANTIFIER PURPOSE
* Matches 0 or more times.
+ Matches 1 or more times.
? Matches 0 or 1 times.
{n} Matches exactly n times.
{n,} Matches at least n times.
{n,m} Matches at least n times but no more than m times.
Regular Expression Patterns
Pattern Interpretation
/a/ Looks for any instance of a.
/a+/ Matches one or more instances of a.
/a*/ Matches zero or more instances of a.
/a?/ Matches zero or one instance of a.
/a|b/ Matches either a or b.
Regular expression: start a string with period
$line = ".Hello!";
if ($line =~ m/^\./) {
print "You shouldn't start a sentence with a period!";
}
Repeating patterns
while(<DATA>){
print if /5{1,}/;
}
__DATA__
Blenheim 1.10
Ben .5
Bill 5.100
Lelly 4.01
Replace pattern
# Substitution on the default variable $_.
use strict;
use warnings;

my $string = "Hello to the world";
$_ = $string;
print "The original string is: \"$_\"\n";
# Search for the word that is actually in the string. The original
# s/planet/world/ had the two sides reversed and could never match.
s/world/planet/;
print "s/world/planet/ changes string: $_ \n\n";
Replace pattern with ()
# Substitution using parentheses as the delimiters.
use strict;
use warnings;

my $string = "Hello to the world";
# Print and modify $string. The original printed the uninitialised $_ and
# ran the substitution on $_, so $string never changed.
print "The original string is: \"$string\"\n";
$string =~ s(world)(planet);
print "s(world)(planet) changes string: $string \n\n";
Reversing subpatterns
while(<DATA>){
s/([A-Z][a-z]+)\s([A-Z][a-z]+)/$2, $1/; # Reverse first and last names
print;
}
__DATA__
S B
B B
A C
N C
J D
K E
Searching from the beginning and the end
You can search for a pattern at a specified location, such as the beginning or end of the string.
Pattern Interpretation
/^a/ Match against a only at beginning of string.
/a$/ Match against a only at end of string.
/a\b/ Match a at end of word.
/a\B/ Match a not at end of word.
/$a/ means to match the value of $a.
/a$/ matches against an a at the end of the string.
/$a$/ matches against the value of the variable $a at the end of the string.
The ^ character acts differently depending on whether it is inside a square bracket or not.
/^a/ looks for a at the start of the string.
/[^a]/ will return true if there is any character other than a anywhere in the word.
Shortcut Expansion Description
\d [0-9] Digits 0 to 9.
\w [0-9A-Za-z_] A 'word' character allowable in a Perl variable name.
\s [ \t\n\r] A whitespace character that is, a space, a tab, a newline or a return.
\D [^0-9] Any non-digit.
\W [^0-9A-Za-z_] A non-'word' character.
\S [^ \t\n\r] A non-blank character.
Shortcuts for Regular Expressions
Pattern Interpretation
/\d/ Any digit.
/\D/ Anything other than a digit.
/\w/ Any word character, i.e., [_0-9a-zA-Z].
/\W/ Anything other than a word character (match on a non word character).
/\s/ Any white space (tab, return, space, or newline).
/\S/ Anything other than white space.
/./ Any character other than a newline.
The \w pattern matches a single character, not a whole word.
To match against a word, try \w+.
Shortest possible match
#!/usr/bin/perl
use warnings;
use strict;
my $company = 'this is a test';
my $match = $company;
while ($company =~ /(this)(?=(.*?is))/g) {
my $got = $1.$2;
$match = $got if length($got) < length($match);
}
print "Shortest possible match is '$match' \n";
Shouldn't start a sentence with a period
$line = ".Hello!";
if ($line =~ m/\A\./) {
print "Shouldn't start a sentence with a period!";
}
Skip blank lines and comments
#!/usr/bin/perl
use warnings;
use strict;
while (<>) {
chomp; # strip trailing linefeed from $_
next if /^(\s*(#.*)?)?$/; # skip blank lines and comments
print "Got: $_ \n";
}
Some Regular Expression Metacharacters
Metacharacter Represents
^ Matches at the beginning of a line
$ Matches at the end of a line
a.c Matches an 'a', any single character, and a 'c'
[abc] Matches an 'a' or 'b' or 'c'
[^abc] Matches a character that is not an 'a' or 'b' or 'c'
[0-9] Matches one digit between '0' and '9'
ab*c Matches an 'a', followed by zero or more 'b's and a 'c'
ab+c Matches an 'a', followed by one or more 'b's and a 'c'
ab?c Matches an 'a', followed by zero or one 'b' and a 'c'
(ab)+c Matches one or more occurrences of 'ab' followed by a 'c'
(ab)(c) Captures 'ab' and assigns it to $1, captures 'c' and assigns it to $2.
.* special-character combination tries to match as much as possible
if the string banana is searched using the following pattern, the pattern
matches banana, not ba or bana:
/b.*a/
The .* character only matches the longest possible string that enables the pattern match as a whole to succeed.
Split a string by using the regular expression
#!/usr/bin/perl -w
# Split a colon-separated record and show fields 0, 2 and 5.
use strict;

my $passwd = "A:B:C:D::/value1/value2:/value3";
my @fields = split /:/, $passwd;
# An array slice picks the three fields of interest in one expression.
print " $_\n" for @fields[0, 2, 5];
Square brackets ([ and ]) delimit a set or range of characters.
[aA] means either a or A.
[a-z] matches any lowercase character.
[0-9] matches any digit.
[0-9a-zA-Z] for characters commonly used in variable names.
You can combine the brackets with other patterns.
Pattern Interpretation
/[aA]/ Matches against a or A.
/[aA]+/ Matches one or more instances of a or A.
/[aA]*/ Matches zero or more instances of a or A.
/[aA]?/ Matches zero or one instance of a or A.
/[^aA]/ Returns true if any character is found that is not a or A.
/[aA]|[bB]/ Matches an instance of a or A or b or B; redundant in this case, as it is the same as /[aAbB]/.
Substitute every T with M
$name="Tom";
print "$name\n" if $name =~ s/T/M/g; # Substitute every T with M
print "What is Tom's last name? ";
print "You got it!\n" if <STDIN> =~ /Jordan/;
Substitute first occurrence of "blue" with "red"
$str = "gree, red, blue, blue, blue...";
$str =~ s/blue/red/;
print $str;
Substitute first T with an M
#!/usr/bin/perl
$name="Tom";
print "Hello Tom\n" if $name =~ /Tom/;
print "$name\n" if $name !~ /Tom/;
$name =~ s/T/M/; # Substitute first T with an M
print "$name.\n";
Substitute I with N and print
while($_= <DATA>){
print if s/I/N/;
}
__DATA__
S
B
I
N
J
K
Substitute N with J
while(<DATA>){
s/N/J/;
print;
}
__DATA__
S B
B B
I C
N C
J D
K E
Substitute tom with Mary
while(<DATA>){
print if s/tom/Mary/i; # Substitute tom with Mary
}
__DATA__
tom
Mary
Jack
James
Jon
Kate
Substitution Example
use strict;
use warnings;
my $string = "Hello to the world";
print "The original string is: \"$string\"\n";
$string =~ s/world/planet/;
print "s/world/planet/ changes string: $string \n\n";
Substitution Modifiers
Modifier Meaning
e Evaluate the replacement side as an expression.
i Turn off case sensitivity.
m Treat a string as multiple lines.
o Compile pattern only once. Used to optimize the search.
s Treat string as single line when newline is embedded.
x Allow whitespace and comments within the regular expression.
g Replace globally; i.e., find all occurrences.
Swap the first two words in a string
#!/usr/bin/perl
use warnings;
use strict;
$_ = "Berkeley: LSD and UNIX";
s/(\w+)\s+(\w+)/$2 $1/;
print $_, "?\n";
$text =~ tr/a-z/d-za-c/;
$text = "hello there!";
print "$text\n";
$text =~ tr/a-z/d-za-c/;
print "$text\n";
$text =~ tr/d-za-c/a-z/;
$text = "hello there!";
print "$text\n";
$text =~ tr/d-za-c/a-z/;
print "$text\n";
The ^ and $ Pattern Anchors
^ and $ match only at the beginning or the end of a string.
/^def/ matches def only if these are the first three characters in the string.
/def$/ matches def only if these are the last three characters in the string.
combine ^ and $ to force matching of the entire string.
/^def$/ matches only if the string is def.
The bracketed character class
while(<DATA>){
print if /[A-Za-z0-9_]/;
}
__DATA__
Tom 101
Jack 201
Nart 301
The bracket metacharacters and negation
while(<DATA>){
print if / [^123]0/
}
__DATA__
101
201
301
401
501
601
The dot metacharacter
while(<DATA>){
print "Found Norma!\n" if /N..ma/;
}
__DATA__
Tom 101
Mary 201
Jason 301
Norma 401
Jack 501
Kate 601
The dot metacharacter and the zero or more quantifier
while(<DATA>){
print if s/[A-Z].*y/Tom/;
}
__DATA__
Mary 101
Tom 201
The e modifier
$_=5;
s/5/6 * 4 - 22/e;
print "The result is: $_\n";
$_=1055;
s/5/3*2/eg;
print "The result is: $_\n";
The e Modifier Evaluating an Expression
Format: s/search pattern/replacement string/e;
# The e and g modifiers
while(<DATA>){
s/6/6 * 7.3/eg; # Substitute 6 with product of 6 * 7.3
print;
}
__DATA__
5
4
6 6
1
666
66
The g Modifier Global Substitution
#Format: s/search pattern/replacement string/g;
# Without the g option
while(<DATA>){
print if s/Tom/Jack/; # First occurrence of Tom on each line is replaced with Jack
}
__DATA__
Tom Dave Dan Tom
Betty Tom Henry Tom
Jack Norma Tom Tom
# With the g option
while(<DATA>){
print if s/Tom/Jack/g; # All occurrences of Tom on each line are replaced with Jack
}
__DATA__
Tom Dave Dan Tom
Betty Tom Henry Tom
Igor Norma Tom Tom
The Greedy Metacharacters
Metacharacter What It Matches
x? Matches 0 or 1 occurrences of x
(xyz)? Matches 0 or 1 occurrences of pattern xyz
x* Matches 0 or more occurrences of x
(xyz)* Matches 0 or more occurrences of pattern xyz
x+ Matches 1 or more occurrences of x
(xyz)+ Matches 1 or more occurrences of pattern xyz
x{m,n} Matches at least m occurrences of x and no more than n occurrences of x
The greedy quantifier
$string="ABCdefghiCxyzwerC YOU!";
$string=~s/.*C/HEY/;
print "$string", "\n";
The grep function evaluates the expression (EXPR) for each element of the array (LIST).
Format: grep(EXPR,LIST)
# Searching for patterns in a list
# qw() builds the list of strings explicitly. Unquoted barewords are
# rejected under "use strict" and clash with any sub of the same name.
@list = qw(tomatoes tomorrow potatoes phantom Tommy);
# In scalar context grep returns the number of matching elements ...
$count = grep( /tom/i, @list);
# ... and in list context the matching elements themselves.
@items= grep( /tom/i, @list);
print "Found items: @items\nNumber found: $count\n";
The i Modifier Case Insensitivity
Format: s/search pattern/replacement string/i;
# Matching with the i option
# /i makes the match case-insensitive, so "norma" also finds "Norma".
while (defined(my $record = <DATA>)) {
    print $record if $record =~ /norma/i;
}
__DATA__
Steve
Jack
Tom
Norma
James
Jason
The metasymbol, \d
# \d matches a single digit, so 6\d\d is a "6" followed by two digits.
while (defined(my $record = <DATA>)) {
    next unless $record =~ /6\d\d/;
    print $record;
}
__DATA__
101
201
301
401
501
601
The m modifier controls the behavior of the $ and ^ anchor metacharacters.
# Anchors and the m modifier
# Without /m, ^ matches only at the very start of the string.  The string
# begins with a newline, so this prints nothing.
$_="\nTomorrow will never be here.\n";
print if /^Tomorrow/;

# \A always means "start of string", so this prints nothing either.
$_="\nTomorrow will never be here.\n";
print if /\ATomorrow/;

# With /m, ^ also matches just after an embedded newline: this prints.
$_="\nTomorrow will never be here.\n";
print if /^Tomorrow/m;

# /m does not change \A, so this still prints nothing.
$_="\nTomorrow will never be here.\n";
print if /\ATomorrow/m;

# With /m, $ also matches just before an embedded newline.  The string
# must actually contain "history." at the end of its first line for the
# example to print anything.
$_="Today is history.\nTomorrow will never be here.\n";
print if /history\.$/m;
The m Operator and Matching
/Regular Expression/ default delimiter
m#Regular Expression# optional delimiters
m{regular expression} pair of delimiters
The \S metasymbol and nonwhitespace
# \S matches any non-whitespace character; every such character is
# replaced with a star.  The trailing newline is whitespace and is kept.
while (defined(my $record = <DATA>)) {
    my $replaced = ($record =~ s/\S/*/g);
    print $record if $replaced;
}
__DATA__
A
AAA
AAAA
The \s metasymbol and whitespace
# \s matches any whitespace character.  Note that this includes the
# newline at the end of each line, so it is replaced with a star as well
# and the output lines run together.
while (defined(my $record = <DATA>)) {
    my $replaced = ($record =~ s/\s/*/g);
    print $record if $replaced;
}
__DATA__
A 101
A A A
The s Modifier-The Dot Metacharacter and the Newline
# The s modifier can be used with both the m (match) and the s (substitution) operators.
# The s modifier and the newline
# Normally the dot does not match a newline; with /s it does.  Each
# pattern below only succeeds because the dot is allowed to match the
# newline embedded in (or ending) the string.
$_="this is a test\nAnother test.\n";
print $& if /test./s;           # dot matches the embedded newline: "test\n"
print $& if /test\../s;         # literal period, then the final newline
print if s/test.A/test, a/s;    # joins the two lines across the newline
The s Operator and Substitution
The s operator is for substitutions.
The g stands for global change on a line.
Format
s/old/new/;
s/old/new/i;
s/old/new/g;
s+old+new+g;
s(old)/new/; s[old]{new};
s/old/expression to be evaluated/e;
s/old/new/ige;
s/old/new/x;
The s operator supports a number of variables
$& contains the matched string;
$` (a backtick character) contains everything before the matched string;
$' contains everything after the matched string.
The [] special characters enable you to define patterns that match one of a group of alternatives.
For example, the following pattern matches def or dEf:
/d[eE]f/
The split function splits up a string EXPR by some delimiter (whitespace by default) and returns an array.
#If a string is not supplied as the expression, the $_ string is split.
#You can specify more than one delimiter, using the regular expression metacharacter [ ].
#[ +\t:] is a character class that matches a single space, plus sign, tab, or colon; write [ \t:]+ to treat a run of such delimiters as one.
#LIMIT specifies the number of fields that can be split.
#Format:
#split("DELIMITER",EXPR,LIMIT)
#split(/DELIMITER/,EXPR,LIMIT)
#split(/DELIMITER/,EXPR)
#split("DELIMITER",EXPR)
#split(/DELIMITER/)
#split
# Splitting a scalar on whitespace and creating a list
# A single-space string as the delimiter splits on runs of whitespace.
my $line = "a b c d e";
my @letter = split(' ', $line);
my ($first, $second) = @letter[0, 1];
print "The first letter is $first\n";
print "The second letter is $second\n";
The word metasymbols
# \w matches one word character (letter, digit, or underscore).
# Print lines containing a word of exactly four characters followed by a
# space and a digit.  \b marks the start of the word; a literal leading
# space cannot be used here because the names begin the line, so nothing
# would ever match.  Only "Jack 201" is printed.
while (<DATA>) {
    print if /\b\w\w\w\w \d/;
}
__DATA__
Tom 101
Jack 201
Jason 301
The x Modifier: The Expressive Modifier
# With /x, whitespace inside the pattern is ignored and "#" starts a
# comment, so only the "A" below is actually matched.
my $text = "A to B\n";
$text =~ /A # Searching for A /x;
print "Comments and spaces were removed and \$& is $&\n";
To get at each field, we can split when we see a colon:
#!/usr/bin/perl
use warnings;
use strict;

# A passwd-style record: colon-separated fields.
my $passwd = "A:B:1:2::/dir:/bin/bash";

# Pick out just the login name, user ID, and home directory fields.
my ($login, $uid, $home) = (split /:/, $passwd)[0, 2, 5];

print "Login name : $login\n";
print "User ID : $uid\n";
print "Home directory : $home\n";
To specify a maximum number of occurrences, use 0 as the lower bound.
/de{0,3}f/ matches d, followed by no more than three es, followed by f.
To specify a minimum number of occurrences, leave off the upper bound.
/de{3,}f/ matches d, followed by at least three es, followed by f.
tr/a-z/A-Z/
#!/usr/bin/perl -w
# Copy input to output, translating lowercase ASCII letters to uppercase.
while (defined(my $line = <>)) {
    $line =~ tr/a-z/A-Z/;
    print $line;
}
tr/i/o/;
# Copy input to output, replacing every "i" with "o".
while (defined(my $line = <>)) {
    $line =~ tr/i/o/;
    print $line;
}
tr/o/i/;
# Copy input to output, replacing every "o" with "i".
while (defined(my $line = <>)) {
    $line =~ tr/o/i/;
    print $line;
}
Turning Off Greediness
Metacharacter What It Matches
x?? Matches 0 or 1 occurrences of x
(xyz)?? Matches 0 or 1 occurrences of pattern xyz
x*? Matches 0 or more occurrences of x
(xyz)*? Matches 0 or more occurrences of pattern xyz
x+? Matches 1 or more occurrences of x
(xyz)+? Matches 1 or more occurrences of pattern xyz
x{m,n}? Matches at least m occurrences of x and no more than n occurrences of x
x{m}? Matches exactly m occurrences of x
x{m,}? Matches at least m times
Use /gc to remember position
#!/usr/bin/perl
use warnings;
use strict;

my $countdown = "3 2 1 abc";

# /g resumes each match where the previous one ended; /c keeps that
# position even after the match finally fails.
while ($countdown =~ /(\d)/gc) {
    my $digit = $1;
    print "$digit...\n";
}
Use \G to match rest of text
#!/usr/bin/perl
use warnings;
use strict;

my $text = "3 2 1 abc";

# Consume the leading digits first.  /c preserves the match position when
# the loop's final attempt fails, which is what gives \G something to
# anchor to below; without a prior /g match \G just means "start of
# string" and the example would print the whole text.
while ($text =~ /(\d)/gc) {
    print "$1...\n";
}

# \G anchors at the position where the last /g match left off, so this
# captures only the text that follows the digits.
if ($text =~ /\G\s*(.+)$/) {
    print ucfirst($1), "!\n";
}
Using alternate without parentheses
#!/usr/bin/perl
use strict;
use warnings;

my $string1 = "hello";
my $string2 = "hello there";
my $string3 = "hi there";

print "$string1\n$string2\n$string3\n";
print "watch this:\n";

# Without parentheses the alternatives are the whole of "hello" and the
# whole of "hi there", so all three strings match.
print "1: how are you?\n" if ( $string1 =~ m/hello|hi there/ );
print "2: how are you?\n" if ( $string2 =~ m/hello|hi there/ );
print "3: how are you?\n" if ( $string3 =~ m/hello|hi there/ );
Using alternate with parentheses
#!/usr/bin/perl
use strict;
use warnings;

my $string1 = "hello";
my $string2 = "hello there";
my $string3 = "hi there";

print "$string1\n$string2\n$string3\n";

# The parentheses limit the alternation to "hello" or "hi"; " there" is
# then required in both cases, so the bare "hello" no longer matches.
print "1: how are you?\n" if ( $string1 =~ m/(hello|hi) there/ );
print "2: how are you?\n" if ( $string2 =~ m/(hello|hi) there/ );
print "3: how are you?\n" if ( $string3 =~ m/(hello|hi) there/ );
Using grep function
# grep keeps the elements for which the test is true; here, the numbers
# below 6.
my @numbers = ( 1 .. 10 );
print "\@numbers: @numbers\n\n";

my @smallNumbers2 = grep { $_ < 6 } @numbers;
print "grep: @smallNumbers2\n\n";
Using grep to remove word
# Drop every word that consists of exactly four word characters and
# print the remaining words separated by single spaces.
my @words = qw(Here are some four letter words.);
my @kept  = grep { !/^\w{4}$/ } @words;
print join(" ", @kept);
Using grep with expression
# Print every element that does not contain "x".  The list is written
# with qw() instead of unquoted barewords, which fail under "use strict"
# and can be mistaken for operators or function names.
print grep(!/x/, qw(a b x d));
Using grep with the file-test operators.
#!/usr/local/bin/perl
# Search every readable, non-hidden entry of the current directory for
# lines matching the pattern given as the first command-line argument,
# printing each hit as "file:line".

opendir(my $currdir, ".") ||
die("Can't open current directory");
# Keep entries that are readable (-r) and whose names do not start
# with a dot.
my @filelist = grep { !/^\./ } grep { -r } readdir($currdir);
closedir($currdir);

foreach my $file (@filelist) {
    # Three-argument open: the name comes from the directory listing, and
    # with two-argument open a name such as ">x" or "cmd|" would be taken
    # as an open mode or a command to run.
    open(my $currfile, '<', $file) || die ("Can't open input file $file");
    while (my $line = <$currfile>) {
        # $ARGV[0] is interpolated as a regular expression.
        if ($line =~ /$ARGV[0]/) {
            print ("$file:$line");
        }
    }
    close ($currfile);
}
Using Patterns with Substitutions
#The s operator substitutes data that match a pattern with replacement data.
#$variable =~ s/pattern/substitution/;
#The s operator returns the number of substitutions made or 0 if none occurred.
#!/usr/bin/perl -w
# Replace the job title inside the string and show the result.
my $person = "Jack, Software Engineer";
$person =~ s/Software Engineer/Tester/;
print "New title: $person.\n";
Using Special-Meaning Characters
\d matches any digit from zero to nine. Its equivalent is [0-9].
\D matches any character that is not a digit from zero to nine. Its equivalent is [^0-9].
\s matches any white space character (tab, form feed, carriage return, space, and newline characters).
\S matches any character but a white space character.
\w matches any alphanumeric character, including the underscore. Its equivalent is [a-zA-Z0-9_].
\W matches any character that is neither alphanumeric nor an underscore. Its equivalent is [^a-zA-Z0-9_].
\b matches on the word boundary.
\B matches not on a word boundary.
\0NN matches the character whose octal code is NN.
\xNN matches the character whose hexadecimal code is NN.
\cC matches the control character Ctrl-C (C may be any letter).
Using split, an anonymous list, and pattern matching
# Split each colon-separated record into named fields and print the name
# (without a newline) when the phone number contains "408-".
while (defined(my $record = <DATA>)) {
    my ($name, $phone, $address) = split(":", $record);
    next unless $phone =~ /408-/;
    print $name;
}
__DATA__
A:111-444-6677:12 Main St.
B:222-222-1234:234 main Ln.
C:408-567-4444:3456 Mary Way
D:555-234-5678:8880 Main St.
E:333-444-6556:54 Main Ln.
F:444-333-7654:123 4th Ave.
Using split and pattern matching
# Split each colon-separated record into an array and print the first
# field when the second field (the phone number) contains "408-".
while (defined(my $record = <DATA>)) {
    my @fields = split(":", $record);
    if ($fields[1] =~ /408-/) {
        print $fields[0], "\n";
    }
}
__DATA__
A:111-444-6677:12 Main St.
B:222-222-1234:234 main Ln.
C:408-567-4444:3456 Mary Way
D:555-234-5678:8880 Main St.
E:333-444-6556:54 Main Ln.
F:444-333-7654:123 4th Ave.
Using substitutions to clean up input data.
#!/usr/bin/perl -w
# Print every line of each file named on the command line with leading
# and trailing whitespace removed.
foreach my $i (0 .. $#ARGV) {
    # Three-argument open with a lexical handle: the path comes from the
    # command line, and two-argument open would interpret characters such
    # as ">" or "|" in it as an open mode or a command.
    open(my $input, '<', $ARGV[$i]) or die "Can't open $ARGV[$i]: $!";
    while (<$input>) {
        my $string = $_;
        remove_trailing($string);   # also strips the line's newline
        remove_leading($string);
        print "$string\n";
    }
    close($input);
}

# remove_leading(STRING)
# Strips leading whitespace from the caller's variable in place
# (elements of @_ are aliases to the arguments).
sub remove_leading {
    my($string) = $_[0];
    $string =~ s/^\s+//;
    $_[0] = $string;
}

# remove_trailing(STRING)
# Strips trailing whitespace, including a final newline, from the
# caller's variable in place.
sub remove_trailing {
    my($string) = $_[0];
    $string =~ s/\s+$//;
    $_[0] = $string;
}
Using tr to count the occurrences of specific characters.
#!/usr/local/bin/perl
# Count punctuation, blank, and other characters on standard input.
$punctuation = $blanks = $total = 0;
while ($input = <STDIN>) {
    # chomp removes only a trailing newline; chop would delete a real
    # character from a final line that does not end in a newline.
    chomp ($input);
    $total += length($input);
    $_ = $input;
    # tr returns the number of characters it matched; translating each
    # character to itself counts without changing the string.
    $punctuation += tr/,:;.-/,:;.-/;
    $blanks += tr/ / /;
}
print ("In this file, there are:\n");
print ("\t$punctuation punctuation characters,\n");
print ("\t$blanks blank characters,\n");
print ("\t", $total - $punctuation - $blanks);
print (" other characters.\n");
Using tr to retrieve the length of a string.
#!/usr/local/bin/perl
# tr returns how many characters it matched.  Mapping letters and spaces
# to themselves counts them without altering the string; for a string
# made only of those characters the count equals its length.
my $string = "here is a string";
my $length = ($string =~ tr/a-zA-Z /a-zA-Z /);
print ("the string is $length characters long\n");
Using variables containing matched subpatterns.
#!/usr/local/bin/perl
# Find every floating-point number in the input and report its parts:
#   $1  integer part (with optional leading minus sign)
#   $2  digits after the decimal point
#   $3  optional exponent such as e+10
while (<>) {
    while (/(-?\d+)\.(\d+)([eE][+-]?\d+)?/g) {
        print ("integer part $1, decimal part $2");
        # The exponent group is optional, so $3 is undef when it did not
        # take part in the match; test with defined rather than comparing
        # undef to a string.
        if (defined $3) {
            print (", exponent $3");
        }
        print ("\n");
    }
}
Word-Boundary Pattern Anchors
\b and \B specify whether a matched pattern must be on a word boundary or inside a word.
The \b pattern anchor specifies that the pattern must be on a word boundary.
/\bdef/ matches only if def is the beginning of a word.
Place \b after the pattern to indicate the end of a word.
/def\b/ matches def and abcdef
/\bdef\b/ matches only the word def.
Zero width loop
#!/usr/bin/perl
use warnings;
use strict;

my $sentence = "this is a test";

# The lookahead matches without consuming any text, yet still captures
# the next character, so the /g loop visits every character in turn.
while ($sentence =~ /(?=(.))/g) {
    my $char = $1;
    print "[$char]";
}