koulab

技術系のメモ

Squidでスクレイピング用に複数IPを持ったRotateProxyサーバを作成する

なにこれ

マルチスレッドで高速でスクレイピングするときにソフトウェア側で串の制御をしないで済む

構成

接続サーバ(親squid):3128 <-> 子squidサーバ:1024-65535 <-> 接続先サイト といった感じで、squidのサーバを2つ用意します。必要に応じて子squidサーバのポートを直接指定して特定のIPで接続することもできるし、接続サーバ(親squid)に繋いでIPを自動でローテーションさせることもできます(acl次第)。

EXTENDED BODY:

前提

NICに複数のIPアドレスが付与されているものとします。実稼働しているグローバルIPをそのまま載せているので、適宜自分のものに変更してください。以下のIPアドレスを使用しています。

107.174.101.43 107.174.101.44 107.174.101.52 107.174.101.53 107.174.101.54 107.174.101.45 107.174.101.46 107.174.101.47 107.174.101.48 107.174.101.49 107.174.101.5 107.174.101.50 107.174.101.51

# ip addr show
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN
    link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
    inet 127.0.0.1/8 scope host lo
    inet6 ::1/128 scope host
       valid_lft forever preferred_lft forever
2: venet0: <BROADCAST,POINTOPOINT,NOARP,UP,LOWER_UP> mtu 1500 qdisc noqueue state UNKNOWN
    link/void
    inet 127.0.0.1/32 scope host venet0
    inet 107.174.101.43/32 brd 107.174.101.43 scope global venet0:0
    inet 107.174.101.44/32 brd 107.174.101.44 scope global venet0:1
    inet 107.174.101.52/32 brd 107.174.101.52 scope global venet0:10
    inet 107.174.101.53/32 brd 107.174.101.53 scope global venet0:11
    inet 107.174.101.54/32 brd 107.174.101.54 scope global venet0:12
    inet 107.174.101.45/32 brd 107.174.101.45 scope global venet0:2
    inet 107.174.101.46/32 brd 107.174.101.46 scope global venet0:3
    inet 107.174.101.47/32 brd 107.174.101.47 scope global venet0:4
    inet 107.174.101.48/32 brd 107.174.101.48 scope global venet0:5
    inet 107.174.101.49/32 brd 107.174.101.49 scope global venet0:6
    inet 107.174.101.5/32 brd 107.174.101.5 scope global venet0:7
    inet 107.174.101.50/32 brd 107.174.101.50 scope global venet0:8
    inet 107.174.101.51/32 brd 107.174.101.51 scope global venet0:9

yum install squid -y

子squidから設定していきます

子squidサーバ squid.conf

#
# Recommended minimum configuration:
#

# Example rule allowing access from your local networks.
# Adapt to list your (internal) IP networks from where browsing
# should be allowed
acl localnet src 10.0.0.0/8     # RFC1918 possible internal network
acl localnet src 172.16.0.0/12  # RFC1918 possible internal network
acl localnet src 192.168.0.0/16 # RFC1918 possible internal network
acl localnet src fc00::/7       # RFC 4193 local private network range
acl localnet src fe80::/10      # RFC 4291 link-local (directly plugged) machines
acl SSL_ports port 443
acl Safe_ports port 80          # http
acl Safe_ports port 21          # ftp
acl Safe_ports port 443         # https
acl Safe_ports port 70          # gopher
acl Safe_ports port 210         # wais
acl Safe_ports port 1025-65535  # unregistered ports
acl Safe_ports port 280         # http-mgmt
acl Safe_ports port 488         # gss-http
acl Safe_ports port 591         # filemaker
acl Safe_ports port 777         # multiling http
acl CONNECT method CONNECT

# One ACL per listening port. Each ipN ACL matches the local port a request
# arrived on and is used by the tcp_outgoing_address rules further down to
# pick the source address for the upstream connection.
# The port numbers must line up with the http_port directives in this file
# (3128 and 4001-4012): ip10-ip12 previously pointed at 4011-4013, which left
# port 4010 unmatched and ip12 bound to a port Squid never listens on.
acl ip0 myport 3128
acl ip1 myport 4001
acl ip2 myport 4002
acl ip3 myport 4003
acl ip4 myport 4004
acl ip5 myport 4005
acl ip6 myport 4006
acl ip7 myport 4007
acl ip8 myport 4008
acl ip9 myport 4009
acl ip10 myport 4010
acl ip11 myport 4011
acl ip12 myport 4012
#acl rotate myport 4013
#
# Recommended minimum Access Permission configuration:
#
# http_access rules are evaluated top to bottom and the first match wins,
# so the order of the rules below is significant.
#
# Deny requests to certain unsafe ports
http_access deny !Safe_ports

# Deny CONNECT to other than secure SSL ports
http_access deny CONNECT !SSL_ports

# Only allow cachemgr access from localhost
http_access allow localhost manager
http_access deny manager

# We strongly recommend the following be uncommented to protect innocent
# web applications running on the proxy server who think the only
# one who can access services on "localhost" is a local user
#http_access deny to_localhost

#
# INSERT YOUR OWN RULE(S) HERE TO ALLOW ACCESS FROM YOUR CLIENTS
#

# Example rule allowing access from your local networks.
# Adapt localnet in the ACL section to list your (internal) IP networks
# from where browsing should be allowed
# NOTE(review): only localnet (private ranges) and localhost sources are
# allowed here. If the parent proxy or a scraper connects from any other
# address (e.g. a public IP), it will presumably hit the final "deny all" --
# confirm, and add a dedicated "acl ... src <address>" + "http_access allow"
# pair above the deny if needed.
http_access allow localnet
http_access allow localhost

# And finally deny all other access to this proxy
http_access deny all

# Squid normally listens to port 3128
http_port 3128


# Additional listening ports, one per extra outgoing address. The ipN
# "myport" ACLs refer to these port numbers.
http_port 4001
http_port 4002
http_port 4003
http_port 4004
http_port 4005
http_port 4006
http_port 4007
http_port 4008
http_port 4009
http_port 4010
http_port 4011
http_port 4012

# Uncomment and adjust the following to add a disk cache directory.
#cache_dir ufs /var/spool/squid 100 16 256

# Leave coredumps in the first cache dir
coredump_dir /var/spool/squid

#
# Add any of your own refresh_pattern entries above these.
#
refresh_pattern ^ftp:           1440    20%     10080
refresh_pattern ^gopher:        1440    0%      1440
refresh_pattern -i (/cgi-bin/|\?) 0     0%      0
refresh_pattern .               0       20%     4320

# Choose the source address of the upstream connection from the port the
# request arrived on (ipN ACLs are the per-port "myport" ACLs defined above).
# ip0 (port 3128) is pinned explicitly as well, so that every listening port
# has a deterministic outgoing address instead of relying on the OS default.
tcp_outgoing_address 107.174.101.43 ip0
tcp_outgoing_address 107.174.101.44 ip1
tcp_outgoing_address 107.174.101.52 ip2
tcp_outgoing_address 107.174.101.53 ip3
tcp_outgoing_address 107.174.101.54 ip4
tcp_outgoing_address 107.174.101.45 ip5
tcp_outgoing_address 107.174.101.46 ip6
tcp_outgoing_address 107.174.101.47 ip7
tcp_outgoing_address 107.174.101.48 ip8
tcp_outgoing_address 107.174.101.49 ip9
tcp_outgoing_address 107.174.101.5 ip10
tcp_outgoing_address 107.174.101.50 ip11
tcp_outgoing_address 107.174.101.51 ip12

# Never cache anything: NOCACHE matches every source address.
acl NOCACHE src all
cache deny NOCACHE

# Host name Squid reports about itself.
visible_hostname unkown

# Do not add the client address to X-Forwarded-For, and strip the headers
# listed below from requests and replies.
forwarded_for off
request_header_access X-Forwarded-For deny all
request_header_access Via deny all
request_header_access Cache-Control deny all
reply_header_access X-Forwarded-For deny all
reply_header_access Via deny all
reply_header_access X-Squid-Error deny all
reply_header_access Cache-Control deny all

# Disable persistent connections on both the server and the client side
# (the server-side directive was previously declared twice).
server_persistent_connections off
client_persistent_connections off

親squidサーバ squid.conf

#
# Recommended minimum configuration:
#

# Example rule allowing access from your local networks.
# Adapt to list your (internal) IP networks from where browsing
# should be allowed
acl localnet src 10.0.0.0/8     # RFC1918 possible internal network
acl localnet src 172.16.0.0/12  # RFC1918 possible internal network
acl localnet src 192.168.0.0/16 # RFC1918 possible internal network
acl localnet src fc00::/7       # RFC 4193 local private network range
acl localnet src fe80::/10      # RFC 4291 link-local (directly plugged) machines
acl SSL_ports port 443
acl Safe_ports port 80          # http
acl Safe_ports port 21          # ftp
acl Safe_ports port 443         # https
acl Safe_ports port 70          # gopher
acl Safe_ports port 210         # wais
acl Safe_ports port 1025-65535  # unregistered ports
acl Safe_ports port 280         # http-mgmt
acl Safe_ports port 488         # gss-http
acl Safe_ports port 591         # filemaker
acl Safe_ports port 777         # multiling http
acl CONNECT method CONNECT

#
# Recommended minimum Access Permission configuration:
#
# http_access rules are evaluated top to bottom and the first match wins,
# so the order of the rules below is significant.
#
# Deny requests to certain unsafe ports
http_access deny !Safe_ports

# Deny CONNECT to other than secure SSL ports
http_access deny CONNECT !SSL_ports

# Only allow cachemgr access from localhost
http_access allow localhost manager
http_access deny manager

# We strongly recommend the following be uncommented to protect innocent
# web applications running on the proxy server who think the only
# one who can access services on "localhost" is a local user
#http_access deny to_localhost

#
# INSERT YOUR OWN RULE(S) HERE TO ALLOW ACCESS FROM YOUR CLIENTS
#

# Example rule allowing access from your local networks.
# Adapt localnet in the ACL section to list your (internal) IP networks
# from where browsing should be allowed
# NOTE(review): clients outside the localnet ranges and localhost are
# rejected by the final "deny all"; add an explicit src ACL above it if the
# scraper runs elsewhere.
http_access allow localnet
http_access allow localhost

# And finally deny all other access to this proxy
http_access deny all

# Squid normally listens to port 3128
# This is the single port clients connect to on the parent proxy.
http_port 3128

# Uncomment and adjust the following to add a disk cache directory.
#cache_dir ufs /var/spool/squid 100 16 256

# Leave coredumps in the first cache dir
coredump_dir /var/spool/squid

#
# Add any of your own refresh_pattern entries above these.
#
refresh_pattern ^ftp:           1440    20%     10080
refresh_pattern ^gopher:        1440    0%      1440
refresh_pattern -i (/cgi-bin/|\?) 0     0%      0
refresh_pattern .               0       20%     4320


# Upstream pool: one peer entry per listening port of the child proxy at
# 107.174.101.43. All peers are "parent" type and selected round-robin;
# no-query disables ICP queries (ICP port is 0) and proxy-only keeps objects
# fetched through a peer out of the local cache.
cache_peer 107.174.101.43 parent 3128 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate0
cache_peer 107.174.101.43 parent 4001 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate1
cache_peer 107.174.101.43 parent 4002 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate2
cache_peer 107.174.101.43 parent 4003 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate3
cache_peer 107.174.101.43 parent 4004 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate4
cache_peer 107.174.101.43 parent 4005 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate5
cache_peer 107.174.101.43 parent 4006 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate6
cache_peer 107.174.101.43 parent 4007 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate7
cache_peer 107.174.101.43 parent 4008 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate8
cache_peer 107.174.101.43 parent 4009 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate9
cache_peer 107.174.101.43 parent 4010 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate10
cache_peer 107.174.101.43 parent 4011 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate11
cache_peer 107.174.101.43 parent 4012 0 connect-fail-limit=3 connect-timeout=8 round-robin no-query allow-miss proxy-only name=rotate12

# Always forward through one of the peers above; never contact origin
# servers directly from this proxy.
never_direct allow all

# Caching is switched off for every source address.
acl NOCACHE src all
cache deny NOCACHE

# No persistent connections towards peers or clients.
server_persistent_connections off
client_persistent_connections off

# Identity and header scrubbing.
visible_hostname unkown
forwarded_for off
request_header_access Via deny all
request_header_access X-Forwarded-For deny all
request_header_access Cache-Control deny all
reply_header_access Via deny all
reply_header_access X-Forwarded-For deny all
reply_header_access X-Squid-Error deny all
reply_header_access Cache-Control deny all