Learning Web Crawling with Perl

#####
#Overview of modules related to web crawling.
#Note: the code below is for overview purposes only and is not meant to be executed as-is.
#####

#!/usr/bin/perl


#####
#HTTP::Thin
#####
use 5.12.1;
use HTTP::Request::Common;
use HTTP::Thin;

# Build the GET request object first, hand it to a fresh HTTP::Thin client,
# and print the stringified reply.
my $thin_request = GET('http://example.com');
my $thin_reply   = HTTP::Thin->new()->request($thin_request);
say $thin_reply->as_string;


#####
#HTTP::Tiny
#####
use HTTP::Tiny;

# Fetch the page; the result is a hash reference describing the response.
my $response = HTTP::Tiny->new->get('http://example.com/');
unless ($response->{success}) {
    die "Failed! \n";
}
print "$response->{status} $response->{reason} \n";

# Dump every header. A header value is either a plain scalar or an array
# reference holding several values, so normalise it to a list first.
while (my ($name, $value) = each %{ $response->{headers} }) {
    my @values = ref $value eq 'ARRAY' ? @{$value} : ($value);
    print "$name: $_ \n" for @values;
}

# Print the body only when it is non-empty.
if (length $response->{content}) {
    print $response->{content};
}

#new
# NOTE: $http, %attributes, $url, %options, $form_data, $method, $data and
# $file_path are placeholders that illustrate the call signatures; they are
# not defined anywhere in this overview.
$http = HTTP::Tiny->new( %attributes );

#valid attributes include:
#-agent
#-cookie_jar
#-default_headers
#-local_address
#-keep_alive
#-max_redirect
#-max_size
#-https_proxy
#-proxy
#-no_proxy
#-timeout
#-verify_SSL
#-SSL_options

#get|head|put|post|delete
$response = $http->get($url);
$response = $http->get($url, \%options);
$response = $http->head($url);

#post_form
$response = $http->post_form($url, $form_data);
$response = $http->post_form($url, $form_data, \%options);

#request
$response = $http->request($method, $url);
$response = $http->request($method, $url, \%options);

# Credentials embedded in the URL as "user:password@host".
# NOTE(review): the original example URLs were mangled by e-mail obfuscation
# on the source page; these are reconstructed after the HTTP::Tiny docs.
$http->request('GET', 'http://user:pwd@example.com/');
#or, when the user name itself contains '@', percent-encode it as %40
$http->request('GET', 'http://mars%40example.com:pwd@example.com/');

#www_form_urlencode
$params = $http->www_form_urlencode( $data );
$response = $http->get("http://example.com/query?$params");

#SSL support
# SSL_options is a constructor attribute, so it is passed to new().
$http = HTTP::Tiny->new(
    SSL_options => {
        SSL_ca_file => $file_path,
    },
);

#proxy support


#####
#WWW::Mechanize
#
#Stateful programmatic web browsing, used for automating interaction with websites.
#####

use WWW::Mechanize;

# NOTE: $url, $expected, $number, $name, $value, $field_values, @criteria and
# $button are placeholders that illustrate the call signatures; they are not
# defined anywhere in this overview.
my $mech = WWW::Mechanize->new();

$mech->get( $url );

# Follow a link chosen by position, by link text, or by exact URL.
$mech->follow_link( n => 3 );
$mech->follow_link( text_regex => qr/download this/i );
$mech->follow_link( url => 'http://host.com/index.html' );

# Fill in and submit a form selected by its position on the page.
$mech->submit_form(
    form_number => 3,
    fields      => {
        username => 'banana',
        password => 'lost-and-alone',
    },
);

# Fill in and submit a form selected by name, pressing a specific button.
$mech->submit_form(
    form_name => 'search',
    fields    => { query => 'pot of gold', },
    button    => 'search now',
);

#testing web applications
use Test::More;

like( $mech->content(), qr/$expected/, "Got expected content" );

#page traverse
$mech->back();

#finer control over page
$mech->find_link( n => $number );
$mech->form_number( $number );
$mech->form_name( $name );
$mech->field( $name, $value );
$mech->set_fields( $field_values );
$mech->set_visible( @criteria );
$mech->click( $button );

#subclass of LWP::UserAgent, eg:
$mech->add_header( $name => $value );


#page-fetching methods

#status methods

#content-handling methods

#link methods

#image methods

#form methods

#field methods

#miscellaneous methods

#overridden LWP::UserAgent methods
#inherited unchanged LWP::UserAgent methods

#With these modules, it is now easy to implement a spider project for future integration.
Mars

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。