# [perl] code library
#-----------------------------
# Naive tag stripper: copy $html_text into $plain_text, then delete every
# "<...>" run from the copy.  The pattern ends at the first ">", so a ">"
# inside a quoted attribute value, a comment, or script text cuts the
# match short -- which is why this approach is marked as wrong.
$plain_text = $html_text;
$plain_text =~ s/<[^>]*>//gs; #WRONG
#-----------------------------
use HTML::Parse;
use HTML::FormatText;
# Parse the HTML into a syntax tree, then let HTML::FormatText render that
# tree as plain text.  The tree is scoped to the do-block so no extra
# variable outlives the conversion.
$plain_text = do {
    my $tree = parse_html($html_text);
    HTML::FormatText->new->format($tree);
};
#-----------------------------
#% perl -pe 's/<[^>]*>//g' file
#-----------------------------
#<IMG SRC = "foo.gif"
# ALT = "Flurp!">
#-----------------------------
#% perl -0777 -pe 's/<[^>]*>//gs' file
#-----------------------------
# Read everything remaining on the FILE handle in a single read: $/ is
# undefined only for the duration of the do-block (whole-file input mode),
# so the normal record separator is restored right after the read.
$html = do { local $/; <FILE> };
# Delete every "<...>" run; [^>] also matches newlines, so tags that span
# several lines are removed as well.
$html =~ s/<[^>]*>//gs;
#-----------------------------
#<IMG SRC = "foo.gif" ALT = "A > B">
#
#<!-- <A comment> -->
#
#<script>if (a<b && a>c)</script>
#
#<# Just data #>
#
#<![INCLUDE CDATA [ >>>>>>>>>>>> ]]>
#-----------------------------
#<!-- This section commented out.
# <B>You can't see me!</B>
#-->
#-----------------------------
# MyParser - HTML::Parser subclass that prints the text content of a
# document with its character entities decoded.
package MyParser;
use HTML::Parser;
use HTML::Entities qw(decode_entities);
@ISA = ('HTML::Parser');

# text($self, $text)
# Receives a chunk of document text, decodes its entities and prints the
# result to the currently selected output handle.
sub text {
    my $self  = shift;
    my $chunk = shift;
    print decode_entities($chunk);
}

package main;
# Parse the already-open filehandle F with a fresh MyParser instance.
MyParser->new()->parse_file(*F);
#-----------------------------
# Pull out whatever sits between <TITLE> and </TITLE>: the match is
# case-insensitive, may span lines, and drops whitespace on either side of
# the captured text.  $title is undef when the document has no such pair.
$title = ($html =~ m#<TITLE>\s*(.*?)\s*</TITLE>#is) ? $1 : undef;
#-----------------------------
# download the following standalone program
#!/usr/bin/perl
# htitle - get html title from URL
#
# Usage: htitle url ...
#
# Fetches every URL given on the command line and prints its HTML title,
# or the HTTP status line when the request does not succeed.  When more
# than one URL is given, each output line is prefixed with "URL: ".
die "usage: $0 url ...\n" unless @ARGV;
require LWP;    # loaded at run time, only after the usage check has passed

# One user agent serves every request; rebuilding it per URL is wasted work.
my $ua = LWP::UserAgent->new();

foreach my $url (@ARGV) {
    my $res = $ua->request(HTTP::Request->new(GET => $url));
    print "$url: " if @ARGV > 1;
    if ($res->is_success) {
        print $res->title, "\n";
    } else {
        print $res->status_line, "\n";
    }
}
#-----------------------------
#% htitle http://www.ora.com
#www.oreilly.com -- Welcome to O'Reilly & Associates!
#
#% htitle http://www.perl.com/ http://www.perl.com/nullvoid
#http://www.perl.com/: The www.perl.com Home Page
#http://www.perl.com/nullvoid: 404 File Not Found
#-----------------------------
# by: posted 2017-09-18 17:48:52 | upvotes (0) | downvotes (0) | reply
# ??
# Reply to comment