#----------------------------- |
# Naive tag stripper: copy $html_text into $plain_text, then delete every
# "<...>" run from the copy.  Kept as the deliberately WRONG example: the
# pattern stops at the first ">" it sees, so a ">" inside a quoted attribute
# value, a comment, or a script body ends the "tag" too early.
$plain_text = $html_text;
$plain_text =~ s/<[^>]*>//gs;    #WRONG
#----------------------------- |
use HTML::Parse;
use HTML::FormatText;

# Convert $html_text to plain text by building a real parse tree and letting
# HTML::FormatText render it, instead of pattern-matching on angle brackets.
my $tree = parse_html($html_text);
$plain_text = HTML::FormatText->new->format($tree);

# Release the parse tree explicitly; older HTML::Tree releases keep parent
# <-> child reference cycles alive until delete is called (harmless on
# newer releases).
$tree->delete;
#----------------------------- |
#% perl -pe 's/<[^>]*>//g' file |
#----------------------------- |
#<IMG SRC = "foo.gif" |
# ALT = "Flurp!"> |
#----------------------------- |
#% perl -0777 -pe 's/<[^>]*>//gs' file |
#----------------------------- |
# Read everything remaining on the FILE handle into $html in one go, then
# strip "<...>" runs from it.  Slurping lets the pattern match tags that
# span several lines.
$html = do {
    local $/;    # temporary whole-file input mode, restored at block exit
    <FILE>;
};
$html =~ s/<[^>]*>//gs;
#----------------------------- |
#<IMG SRC = "foo.gif" ALT = "A > B"> |
# |
#<!-- <A comment> --> |
# |
#<script>if (a<b && a>c)</script> |
# |
#<# Just data #> |
# |
#<![INCLUDE CDATA [ >>>>>>>>>>>> ]]> |
#----------------------------- |
#<!-- This section commented out. |
# <B>You can't see me!</B> |
#--> |
#----------------------------- |
# MyParser - HTML::Parser subclass that prints only the text content of a
# document.  Constructed with no arguments, so HTML::Parser dispatches to
# the version-2 style callback methods such as text().
package MyParser;
use HTML::Parser;
use HTML::Entities qw(decode_entities);
our @ISA = qw(HTML::Parser);

# text($self, $text, $is_cdata)
#   Callback invoked for each run of text the parser recognises.
#   $text     - the raw text, entities not yet expanded
#   $is_cdata - true when the text comes from a literal (CDATA-like)
#               element such as <script> or <style>
# Prints the text with entities decoded; literal-element content is printed
# untouched because entities are not recognised inside such elements.
sub text {
    my ( $self, $text, $is_cdata ) = @_;
    print $is_cdata ? $text : decode_entities($text);
}

package main;

# Parse the document available on the already-open filehandle F.
MyParser->new->parse_file(*F);
#----------------------------- |
# Pull the document title out of $html: case-insensitive, may span lines,
# with surrounding whitespace trimmed.  $title is undef when no <TITLE>
# element is found.  Brace delimiters are used because "m #...#" (with a
# space before the #) is parsed as "m" followed by a comment, not a match.
( $title ) = ( $html =~ m{<TITLE>\s*(.*?)\s*</TITLE>}is );
#----------------------------- |
# download the following standalone program |
#!/usr/bin/perl
# htitle - get html title from URL
#
# Usage: htitle url ...
# Fetches each URL and prints its HTML title, or the HTTP status line when
# the request fails.  Each line is prefixed with the URL when more than one
# URL is given.
use strict;
use warnings;

die "usage: $0 url ...\n" unless @ARGV;
require LWP;

# One user agent serves every request; there is no need to build a fresh
# one per URL.
my $ua         = LWP::UserAgent->new();
my $label_urls = @ARGV > 1;

foreach my $url (@ARGV) {
    my $res = $ua->request( HTTP::Request->new( GET => $url ) );
    print "$url: " if $label_urls;
    if ( $res->is_success ) {
        # A page without a <title> yields undef; print an empty title
        # rather than triggering an "uninitialized value" warning.
        my $title = $res->title;
        print( ( defined $title ? $title : '' ), "\n" );
    }
    else {
        print $res->status_line, "\n";
    }
}
#----------------------------- |
#% htitle http://www.ora.com |
#www.oreilly.com -- Welcome to O'Reilly & Associates! |
# |
#% htitle http://www.perl.com/ http://www.perl.com/nullvoid |
#http://www.perl.com/: The www.perl.com Home Page |
#http://www.perl.com/nullvoid: 404 File Not Found |
#----------------------------- |
by: 发表于:2017-09-18 17:48:52 顶(0) | 踩(0) 回复
??
回复评论