[Perl] Scraping the 句子大全 (1juzi.com) website


[Perl] Scraping the 句子大全 (1juzi.com) website

Post #1 by 523066680 » 2018-10-06 09:32

Part 1: scraping by the site index
Two steps:
Step 1: fetch the HTML
Code: (GetHTML.pl)
=info
    523066680/vicyang
    2018-10
=cut

use utf8;
use Encode;
use File::Path;
use File::Slurp;
use LWP::UserAgent;
use File::Basename qw/basename/;
use Mojo::DOM;
STDOUT->autoflush(1);

our $wdir = encode('gbk', "D:/temp/句子大全");
mkpath $wdir unless -e $wdir;
our $main = "http://www.1juzi.com";
our $ua = LWP::UserAgent->new( keep_alive => 1, timeout => 8 );
my $res = $ua->get($main);
my $html = $res->content();
my $dom = Mojo::DOM->new($html);

my (@urls, @dirs);
get_item($dom, \@urls, \@dirs);

my $tdir;
for my $id ( 0 .. $#urls )
{
    printf "%s\n", $dirs[$id];
    next if -e $dirs[$id];                 # Skip when this folder exists
    $tdir = $dirs[$id] ."_";               # Work under a temp name until done
    mkpath $tdir unless -e $tdir;
    get_alist( $main .$urls[$id], $tdir );
    rename( $tdir, $dirs[$id] );           # Restore name
}

sub get_item
{
    our $wdir;
    my ($dom, $urls, $dirs) = @_;
    my $menu = $dom->at(".header-menu");

    for my $e ( $menu->find("ul li a")->each )
    {
        push @$urls, $e->attr("href");
        # The category heading sits just before the <ul> that holds the link
        push @$dirs, sprintf "%s/%s/%s", $wdir, $e->parent->parent->previous->text, $e->text;
    }
}

sub get_alist
{
    our ($main, $ua);
    my ($url, $dir) = @_;
    my $res = $ua->get( $url );
    my $dom = Mojo::DOM->new( $res->content );
    my @links = @{ $dom->at(".alist")->find("a")->map(attr=>"href") };

    # get_page: retry each article until the request succeeds
    my $retry;
    for my $link ( @links )
    {
        printf "    %s\n", $link;
        $retry = 0;
        do
        {
            $res = $ua->get( $main .$link );
            $retry++;
            print "retry times: $retry\n" if ($retry > 1);
        }
        until ( $res->is_success() );

        write_file( $dir ."/". basename($link), $res->content );
    }
}
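
The `$e->parent->parent->previous->text` hop in get_item encodes an assumption about the menu markup: each link lives in a <li> inside a <ul>, and the element immediately before that <ul> is the category heading. A minimal sketch of that traversal on invented markup (the sample HTML below is for illustration only, not copied from 1juzi.com):

use Mojo::DOM;

# Hypothetical markup shaped the way get_item expects .header-menu to be
my $dom = Mojo::DOM->new(<<'HTML');
<div class="header-menu">
  <h3>Category A</h3>
  <ul><li><a href="/a/">List A</a></li></ul>
  <h3>Category B</h3>
  <ul><li><a href="/b/">List B</a></li></ul>
</div>
HTML

for my $e ( $dom->at(".header-menu")->find("ul li a")->each )
{
    # a -> parent li -> parent ul; ->previous is the heading before that ul
    printf "%s / %s => %s\n",
        $e->parent->parent->previous->text, $e->text, $e->attr("href");
}

If the site ever changes that layout, this is the line that breaks first.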


Step 2: extract the text from the local HTML files and collate it by category
Code: (GetArticle.pl)
=info
    523066680/vicyang
    2018-10
=cut

use utf8;
use Encode;
use File::Slurp;
use Mojo::DOM;
STDOUT->autoflush(1);

our $wdir = encode('gbk', "D:/Temp/句子大全");
chdir $wdir or warn "$!";

my $buff;
my @files;
my @dirs = `dir "$wdir" /ad /s /b`;     # All sub-directories, recursive
s/\r?\n// for @dirs;

for my $dir ( @dirs )
{
    printf "%s\n", $dir;
    chdir $dir or die "$!";
    @files = glob "*.html";
    next unless $#files >= 0;
    $buff = "";
    # Merge the articles, highest page number first
    $buff .= article( $_ ) for sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files;
    write_file( "${dir}.txt", $buff );
}

sub article
{
    my $file = shift;
    my $html = decode('gbk', scalar(read_file( $file )) );
    $html =~ s/&nbsp;//g;

    my $dom = Mojo::DOM->new( $html );
    # remove tags: <script>, <u>, and next/prev page
    $_->remove for $dom->at(".content")->find("script")->each;
    $_->remove for $dom->at(".content")->find("u")->each;
    $dom->at(".page")->remove;
    my $title = $dom->at("h1")->all_text;
    my $text  = $dom->at(".content")->all_text;

    $text =~ s/(\d+、)/\n$1/g;     # One numbered sentence per line
    $text =~ s/\Q$title\E//;       # Drop the repeated title
    $text =~ s/[\r\n]+/\n/g;
    $text =~ s/^\n//;

    my $str;
    $str  = sprintf "%s\n", encode('gbk', $title );
    $str .= sprintf "%s\n", $file;
    $str .= sprintf "%s\n", encode('gbk', $text);
    return $str;
}
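
The sort above strips the trailing ".html" (five characters) with substr and compares the remaining page numbers numerically, descending; a plain lexical sort would misorder them ("10.html" before "9.html"). A quick standalone check:

my @files  = qw/9.html 10.html 100.html 2.html/;
my @bynum  = sort { substr($b, 0, -5) <=> substr($a, 0, -5) } @files;
my @bytext = sort @files;
print "numeric desc: @bynum\n";    # 100.html 10.html 9.html 2.html
print "lexical:      @bytext\n";   # 10.html 100.html 2.html 9.html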


Re: [Perl] Scraping the 句子大全 (1juzi.com) website

Post #2 by 523066680 » 2018-10-06 09:33

The article lists under each category on this site are not complete; many articles are never shown there directly.
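
A rough way to confirm this, sketched under a few assumptions: that articles follow the /new/<n>.html numbering used by the threaded fetcher below, that an unlisted ID still answers 200, and that the server accepts HEAD requests. Collect the hrefs a page actually shows, then probe a small ID range for pages the list never mentioned:

use LWP::UserAgent;
use Mojo::DOM;

my $main = "http://www.1juzi.com";
my $ua   = LWP::UserAgent->new( timeout => 8 );

# Links that are actually visible on the front page
my %listed;
my $res = $ua->get($main);
$listed{$_} = 1
    for @{ Mojo::DOM->new($res->content)->find('a[href^="/new/"]')->map(attr => "href") };

# Probe a small range of article IDs directly
for my $n ( 1 .. 50 )
{
    my $link = "/new/$n.html";
    next if $listed{$link};
    print "unlisted: $link\n" if $ua->head( $main . $link )->is_success;
}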


Fetching the HTML by page number, multi-threaded

Post #3 by 523066680 » 2018-10-06 09:34

Grabbing everything comes to roughly 3 GB.

Code: (Get_html_threads.pl)
=info
    523066680/vicyang
    2018-10
=cut

use Modern::Perl;
use utf8;
use Encode;
use File::Path;
use File::Slurp;
use LWP::UserAgent;
use Mojo::DOM;
use threads;
use threads::shared;
use Time::HiRes qw/sleep time/;
STDOUT->autoflush(1);

our @ths;
our $iter   :shared;
our $failed :shared;

our $main = "http://www.1juzi.com";
our $wdir = encode('gbk', "D:/temp/句子大全_byNo.");
mkpath $wdir unless -e $wdir;
chdir $wdir;

$iter = 1;
$failed = 0;
# Create the worker threads
push @ths, threads->create( \&func, $_ ) for 0 .. 3;

# Once several fetches have failed we are probably past the last page
while ( $failed <= 5 ) { sleep 1.0; }

# Terminate and detach the threads
$_->detach() for threads->list(threads::all);

sub func
{
    our ($main, $failed, $iter);
    my $idx = shift;
    my ($link, $file);
    my $ua = LWP::UserAgent->new( keep_alive => 1, timeout => 6 );

    $SIG{'BREAK'} = sub { threads->exit() };   # Ctrl-Break on Windows

    my $res;
    my $retry;
    my $task;

    while (1)
    {
        {
            lock($iter);        # Take the next page number atomically
            $task = $iter++;
        }

        $link = "${main}/new/${task}.html";
        $file = "${task}.html";
        if ( -e $file ) { printf "%s exists\n", $file; next; }

        printf "%s\n", $file;
        $retry = 0;
        do
        {
            $res = $ua->get( $link );
            if ($retry > 0)
            {
                printf "[%d]%s%s, retry times: %d\n", $idx, "  "x($idx+1), $file, $retry;
                sleep 0.5;
            }
            $retry++;
        }
        until ( $res->is_success() or ($retry > 3) );

        if ( $res->is_success ) { write_file( $file, $res->content ); }
        else { $failed++; }
    }
}
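
The dispatch pattern is the interesting part: each worker holds the lock on $iter only long enough to take the next page number, then does the slow fetch outside the lock, so the four threads serialize on the counter but not on the network. The same pattern in isolation, with a fixed task limit standing in for the failure counter (demo only):

use threads;
use threads::shared;

my $iter :shared = 1;
my $limit = 20;

sub worker
{
    my $id = shift;
    while (1)
    {
        my $task;
        {
            lock($iter);     # Only the increment is serialized
            $task = $iter++;
        }
        last if $task > $limit;
        printf "worker %d took task %d\n", $id, $task;
    }
}

my @ths = map { threads->create( \&worker, $_ ) } 0 .. 3;
$_->join for @ths;

The demo can join its workers because they return on their own; the scraper's workers never return, which is why the original detaches them once the failure counter trips.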

