Warning: array_keys() [function.array-keys]: The first argument should be an array in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 1827
Warning: Invalid argument supplied for foreach() in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 1827
Warning: Invalid argument supplied for foreach() in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 2180
Warning: Invalid argument supplied for foreach() in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3025
Warning: implode() [function.implode]: Argument to implode must be an array. in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3077
Warning: array_keys() [function.array-keys]: The first argument should be an array in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3108
Warning: Invalid argument supplied for foreach() in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3108
Warning: array_keys() [function.array-keys]: The first argument should be an array in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3151
Warning: Invalid argument supplied for foreach() in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3151
Warning: array_keys() [function.array-keys]: The first argument should be an array in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3292
Warning: Invalid argument supplied for foreach() in /home/thomas/public_html/wp/wp-content/plugins/wp-syntax/geshi/geshi.php on line 3292
以前常用來解析 HTML (對稱式 tag 都可以) 或是 XML 的語法
my @table = $html =~ /<table((?:(?!<table).)*)\/table>/igsx;
@table : 宣告一個陣列用來儲存比對成功的每筆結果
$html : 一般HTML網頁用來進行 regular expression
/.../ : 比對用的 pattern 寫在兩個斜線之間,結尾的 igsx 是修飾參數
i : 忽略大小寫
g : 全部找,要不然找到一個就會停止繼續找
s : 讓 . 也能比對換行字元,多行也當作一行
x : 忽略 pattern 內的空白
=~ /<table((?:(?!<table).)*)\/table>/igsx; 以 <table 為開始,到 /table> 截止,中間可以是任何字元但不包含 <table
如此一來如果網頁有很多table 就分別儲存於 $table[0], $table[1], $table[2] ….
所以
XML 語法也可以這樣解析,不需要其他Module!
簡單的比對想法就是 開始到結束之間不包含開始
有時候可能是 開始到結束之間不包含結束
範例:
#!/bin/perl -s
############################################################################
#    parsencl.pl  -  description                                           #
#    -------------------                                                   #
#    copyright            : (C) 2000 by Yu-Chin Tsai                       #
#    email                : tlinux.tsai@gmail.com                          #
############################################################################

############################################################################
#                                                                          #
#   This program is free software; you can redistribute it and/or modify   #
#   it under the terms of the GNU General Public License as published by   #
#   the Free Software Foundation; either version 2 of the License, or      #
#   (at your option) any later version.                                    #
#                                                                          #
############################################################################

# Posts one holdings query per call number (101 .. 200) to a library web
# form, extracts the result rows from the returned HTML with nested
# "start ... end, not containing start" regexes, and appends one quoted,
# comma-separated line per row to ncl.htm.  Progress and each parsed row
# are also echoed to STDERR via warn.

use strict;
use warnings;
use Encode;
use LWP;

# Output file (the name is kept as in the original script).
my $out_file = 'ncl.htm';
open(my $rec, '>', $out_file)
    or die "Cannot open $out_file for writing: $!";

# The user agent and target URL do not change between requests, so they
# are created once instead of on every iteration.
my $browser = LWP::UserAgent->new;
my $url     = 'http://xxxx';    # real address withheld by the author

CALL_NUMBER:
for my $i (0 .. 99) {

    # Call number to query, zero-padded to at least three digits.
    my $CN = sprintf "%03d", 101 + $i;

    # Fetch the holdings page for this call number.
    my $response = $browser->post(
        $url,
        [
            'CN'       => $CN,         # call number
            'BR'       => 'BS',        # holding location
            'PAGELINE' => '50',        # records per page
            'routine'  => 'holding'    # required field
        ]
    );
    # See http://lib.ncl.edu.tw/web/cross.htm for further search fields
    # that can be added to the form above.

    warn $CN." \n".$response->status_line;

    # A failed request has no result page worth parsing; the status line
    # has already been reported above.
    next CALL_NUMBER unless $response->is_success;

    # Convert the page from Big5 to UTF-8.
    my $html = encode("utf8", decode("big5", $response->content));

    # Strip junk: non-breaking-space entities.
    # NOTE(review): the published listing shows a bare space before the
    # ';' here -- presumably the "&nbsp;" entity was decoded when the post
    # was rendered.  Confirm against the original script.
    $html =~ s/&nbsp;//ig;
    #print "Source html $html \n\n";

    # Parse the HTML into CSV.  This is the key step: capture everything
    # from "<table" up to "/table>" that does not contain another "<table".
    my @table = $html =~ /<table((?:(?!<table).)*)\/table>/igsx;

    if ($#table == 0) {
        print {$rec} "$CN no data\n";
        warn "no data\n$html\n\n";
    }

    # The book data lives in $table[2]; without it there are no rows.
    next CALL_NUMBER unless defined $table[2];

    my @tr = $table[2] =~ /<tr((?:(?!<tr>).)*)\/tr>/igsx;
    shift(@tr);    # discard the first captured row

    for my $row (@tr) {

        # Remove angle brackets and the alignment attribute so that the
        # cells can be picked out with a plain "td ... /td" match.
        $row =~ s/<//ig;
        $row =~ s/>//ig;
        $row =~ s/align=left//ig;

        my @td = $row =~ /td((?:(?!td).)*)\/td/igsx;

        # Columns 3..6 are the ones written out; a short row yields
        # empty strings rather than undef.
        my @field = map { defined $_ ? $_ : '' } @td[3 .. 6];
        $field[0] =~ s/\s+//ig;
        $field[1] =~ s/\s+//ig;
        $field[2] =~ s/\s+//ig;
        $field[3] =~ s/\s\s+//ig;    # only runs of 2+ whitespace are removed

        my $line = "\"$CN\",\"$field[0]\",\"$field[1]\",\"$field[2]\",\"$field[3]\"\n";
        warn $line;
        print {$rec} $line;
    }
}
continue {
    # Throttle: pause after every request, including skipped ones.
    sleep 5;
}

close($rec) or die "Cannot close $out_file: $!";

