Install ZooKeeper

Requirements

  • Java 8

cd /opt

wget http://mirrors.hust.edu.cn/apache/zookeeper/stable/zookeeper-3.4.7.tar.gz

tar -zvxf zookeeper-3.4.7.tar.gz

mv zookeeper-3.4.7 zookeeper

cd zookeeper

cd conf

cp zoo_sample.cfg zoo.cfg

vi zoo.cfg

tickTime=2000
initLimit=10
syncLimit=5
dataDir=/home/hadoop/zookeeper/export
clientPort=2181
#server.1=zoo1:2888:3888
#server.2=zoo2:2888:3888
#server.3=zoo3:2888:3888
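
The dataDir above must exist before ZooKeeper starts. If you uncomment the server.N lines for a replicated ensemble, each node also needs a myid file; a minimal sketch (the id 1 below is an assumption for the first node):

mkdir -p /home/hadoop/zookeeper/export
# Only needed for a replicated ensemble; the number must match this node's server.N entry
echo 1 > /home/hadoop/zookeeper/export/myid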

Install Kafka

cd /opt

Use the binary distribution; the -src tarball must be built with Gradle before the bin/ scripts will run.

wget http://mirrors.hust.edu.cn/apache/kafka/0.9.0.0/kafka_2.11-0.9.0.0.tgz

tar -zvxf kafka_2.11-0.9.0.0.tgz

mv kafka_2.11-0.9.0.0 kafka

Install Hadoop

cd /opt

wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz

tar -zvxf hadoop-2.7.1.tar.gz

mv hadoop-2.7.1 hadoop

cd hadoop

vi etc/hadoop/hdfs-site.xml

<configuration>
        <property>
                <name>dfs.datanode.max.transfer.threads</name>
                <value>4096</value>
        </property>
        <property>
                <name>dfs.replication</name>
                <value>1</value>
        </property>
        <property>
                <name>dfs.name.dir</name>
                <value>file:///opt/hadoop/hadoopinfra/hdfs/namenode</value>
        </property>
        <property>
                <name>dfs.data.dir</name>
                <value>file:///opt/hadoop/hadoopinfra/hdfs/datanode</value>
        </property>
</configuration>

vi etc/hadoop/hadoop-env.sh

export JAVA_HOME=/usr    # works because java is in /usr/bin; otherwise point this at the JDK install directory

vi etc/hadoop/core-site.xml

<configuration>
   <property>
      <name>fs.default.name</name>
      <value>hdfs://master:9000</value>
   </property>
</configuration>

For a single-node setup, set the value to hdfs://localhost:9000 instead.

vi etc/hadoop/yarn-site.xml

<configuration>
        <property>
                <name>yarn.nodemanager.aux-services</name>
                <value>mapreduce_shuffle</value>
        </property>
</configuration>

cp etc/hadoop/mapred-site.xml.template etc/hadoop/mapred-site.xml

vi etc/hadoop/mapred-site.xml

<configuration>
        <property>
                <name>mapreduce.framework.name</name>
                <value>yarn</value>
        </property>
</configuration>

Initial format of the NameNode (formatting erases any existing HDFS data)

hdfs namenode -format
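
start-dfs.sh (used in the start and stop section below) drives the daemons over SSH, so passwordless SSH to the local host is typically needed first; a minimal sketch, assuming no key pair exists yet:

ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
chmod 600 ~/.ssh/authorized_keys

After HDFS is started, hdfs dfsadmin -report should list the DataNode.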

Install HBase

cd /opt

Use the binary distribution; the -src tarball must be built with Maven before the bin/ scripts will run.

wget http://mirrors.hust.edu.cn/apache/hbase/stable/hbase-1.1.2-bin.tar.gz

tar -zvxf hbase-1.1.2-bin.tar.gz

mv hbase-1.1.2 hbase

cd hbase

vi conf/hbase-env.sh

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers
export HBASE_MANAGES_ZK=false

# Configure PermSize. Only needed in JDK7. You can safely remove it for JDK8+
#export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
#export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"

vi conf/hbase-site.xml

<configuration>
  <property>
    <name>hbase.zookeeper.quorum</name>
    <value>master,data2,data3</value>
    <description>Comma-separated list of servers in the ZooKeeper quorum.
    </description>
  </property>
  <property>
    <name>hbase.zookeeper.property.dataDir</name>
    <value>/home/hadoop/zookeeper/export</value>
    <description>Property from ZooKeeper config zoo.cfg.
    The directory where the snapshot is stored.
    </description>
  </property>
  <property>
    <name>hbase.rootdir</name>
    <value>hdfs://master:9000/home/hadoop/hbase</value>
    <description>The directory shared by RegionServers.
    </description>
  </property>
  <property>
    <name>hbase.cluster.distributed</name>
    <value>true</value>
    <description>The mode the cluster will be in. Possible values are
      false: standalone and pseudo-distributed setups with managed ZooKeeper
      true: fully-distributed with unmanaged ZooKeeper quorum (see hbase-env.sh)
    </description>
  </property>
</configuration>

vi conf/regionservers

data2
data3
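
The hostnames master, data2 and data3 used above must resolve on every node. Without DNS, /etc/hosts entries are one option (the addresses below are placeholders):

# /etc/hosts - example addresses, replace with your own
192.168.1.10  master
192.168.1.11  data2
192.168.1.12  data3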

ENV

vi ~/.bashrc

If JDK 1.8 was installed via yum, do not set export JAVA_HOME=/opt/jdk1.8.0_40; use the commented OpenJDK path below instead.

#HADOOP VARIABLES START
#export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export JAVA_HOME=/opt/jdk1.8.0_40
export HADOOP_INSTALL=/opt/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
export PATH=$PATH:$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_HOME=$HADOOP_INSTALL
export HADOOP_HDFS_HOME=$HADOOP_INSTALL
export YARN_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_INSTALL/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_INSTALL/lib/native"
export HADOOP_HOME=$HADOOP_INSTALL
#HADOOP VARIABLES END

#HBASE VARIABLES
export HBASE_HOME=/opt/hbase
export HBASE_CONF=$HBASE_HOME/conf
export CLASSPATH=$CLASSPATH:$HBASE_HOME/lib/*
#HBASE VARIABLES END

export PATH=$PATH:$HBASE_HOME/bin

export CQLSH_HOST=127.0.0.1
export CQLSH_PORT=9042

source ~/.bashrc
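
A quick check that the new variables took effect (versions should match the tarballs downloaded above):

hadoop version

hbase version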

Additional installs

yum install thrift

yum install snappy-devel

pip install sqlalchemy

pip install pyzmq

Install Distributed Frontera from Git (recommended method)

cd /opt

git clone https://github.com/scrapinghub/distributed-frontera.git

pip install /opt/distributed-frontera

Install Distributed Frontera with pip

pip install distributed-frontera

pip install hbase-thrift

pip install PyHBase

Required Python packages:

happybase, kafka-python, msgpack-python, python-snappy, frontera, thrift
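
These can be installed in one pass; python-snappy compiles against the snappy-devel headers installed above:

pip install happybase kafka-python msgpack-python python-snappy frontera thrift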

firewall tweaking

sudo firewall-cmd --zone=public --add-port=2181/tcp --permanent

sudo firewall-cmd --zone=public --add-port=60000/tcp --permanent

sudo firewall-cmd --zone=public --add-port=9000/tcp --permanent

sudo firewall-cmd --reload
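
To confirm the rules took effect after the reload:

sudo firewall-cmd --zone=public --list-ports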

start and stop services

hadoop

/opt/hadoop/sbin/start-dfs.sh

/opt/hadoop/sbin/start-yarn.sh

/opt/hadoop/sbin/stop-dfs.sh

/opt/hadoop/sbin/stop-yarn.sh

zookeeper

/opt/zookeeper/bin/zkServer.sh start

/opt/zookeeper/bin/zkServer.sh stop

view zookeeper

/opt/zookeeper/bin/zkCli.sh -server 127.0.0.1:2181
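
Inside the CLI, ls / lists the root znodes; once HBase is running, an /hbase znode should appear:

ls /
ls /hbase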

hbase

/opt/hbase/bin/hbase-daemon.sh start master

/opt/hbase/bin/hbase-daemon.sh start regionserver

/opt/hbase/bin/hbase-daemon.sh stop master

/opt/hbase/bin/hbase-daemon.sh stop regionserver

thrift for hbase

hbase thrift start

hbase thrift -p 7777 start
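
A quick way to confirm the Thrift server is reachable (assumes happybase from the required packages above, and the default port 9090, which matches HBASE_THRIFT_PORT in the worker settings below):

python -c "import happybase; print(happybase.Connection('localhost', 9090).tables())"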

kafka

/opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties
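
If automatic topic creation is disabled, the topics named in the worker settings below can be created manually with kafka-topics.sh (the partition counts here are an assumption matching HBASE_QUEUE_PARTITIONS = 2):

/opt/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 2 --topic frontier-todo

/opt/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 2 --topic frontier-done

/opt/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic frontier-score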

Verify services are running

jps

The following processes must be running (PIDs will vary):

25571 HMaster
25764 HRegionServer
26420 Main
25110 DataNode
26519 Jps
24968 NameNode
14988 QuorumPeerMain
25310 SecondaryNameNode

sample 

https://github.com/scrapinghub/distributed-frontera/blob/master/docs/source/topics/quickstart.rst

/opt/hbase/bin/hbase shell

create_namespace 'crawler'

quit
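
Running list_namespace in the same shell before quitting confirms the namespace exists:

/opt/hbase/bin/hbase shell

list_namespace

quit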

cd /var/www/html

git clone https://github.com/sibiryakov/general-spider.git

cd general-spider

vi frontier/workersettings.py

=== replace content ===
# -*- coding: utf-8 -*-
from frontera.settings.default_settings import *
#from distributed_frontera.settings.default_settings import MIDDLEWARES
from distributed_frontera.settings import default_settings

MAX_REQUESTS = 0
MAX_NEXT_REQUESTS = 128     # Size of batch to generate per partition, should be consistent with
                            # CONCURRENT_REQUESTS in spider. General recommendation is 5-7x CONCURRENT_REQUESTS
CONSUMER_BATCH_SIZE = 512   # Batch size for updates to backend storage
NEW_BATCH_DELAY = 30.0      # Makes the spider wait the specified time after getting an
                            # empty response from the backend

#--------------------------------------------------------
# Url storage
#--------------------------------------------------------
BACKEND = 'distributed_frontera.backends.hbase.HBaseBackend'
HBASE_DROP_ALL_TABLES = False
HBASE_THRIFT_PORT = 9090
HBASE_THRIFT_HOST = 'localhost'
HBASE_QUEUE_PARTITIONS = 2  # Count of spider instances
STORE_CONTENT = True

MIDDLEWARES.extend([
    'frontera.contrib.middlewares.domain.DomainMiddleware',
    'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware'
])

KAFKA_LOCATION = 'localhost:9092'
FRONTIER_GROUP = 'scrapy-crawler'
INCOMING_TOPIC = 'frontier-done'    # Topic where spiders send fetch results
OUTGOING_TOPIC = 'frontier-todo'    # Topic where requests to be downloaded are written
SCORING_GROUP = 'scrapy-scoring'
SCORING_TOPIC = 'frontier-score'    # Channel for scores produced by the strategy worker and
                                    # read by the storage worker

#--------------------------------------------------------
# Logging
#--------------------------------------------------------
LOGGING_EVENTS_ENABLED = False
LOGGING_MANAGER_ENABLED = True
LOGGING_BACKEND_ENABLED = True
LOGGING_DEBUGGING_ENABLED = False

vi frontier/spider_settings.py

=== replace content ===

# -*- coding: utf-8 -*-
from frontera.settings.default_settings import *
#from distributed_frontera.settings.default_settings import MIDDLEWARES
from distributed_frontera.settings import default_settings

SPIDER_PARTITION_ID = 0                 # Partition ID assigned
MAX_NEXT_REQUESTS = 256                 # Should be consistent with MAX_NEXT_REQUESTS set for Frontera worker
DELAY_ON_EMPTY = 5.0

MIDDLEWARES.extend([
    'frontera.contrib.middlewares.domain.DomainMiddleware',
    'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware'
])

#--------------------------------------------------------
# Crawl frontier backend
#--------------------------------------------------------
BACKEND = 'distributed_frontera.backends.remote.KafkaOverusedBackend'
KAFKA_LOCATION = 'localhost:9092'       # Your Kafka service location
HBASE_NAMESPACE = 'crawler'

#--------------------------------------------------------
# Logging
#--------------------------------------------------------
LOGGING_ENABLED = True
LOGGING_EVENTS_ENABLED = False
LOGGING_MANAGER_ENABLED = False
LOGGING_BACKEND_ENABLED = False
LOGGING_DEBUGGING_ENABLED = False

open new terminal -> start ZeroMQ broker

cd /var/www/html/general-spider

python -m distributed_frontera.messagebus.zeromq.broker

open new terminal -> start DB worker

cd /var/www/html/general-spider

python -m distributed_frontera.worker.main --config frontier.workersettings

open new terminal -> start strategy worker

cd /var/www/html/general-spider

python -m distributed_frontera.worker.score --config frontier.strategy0 --strategy distributed_frontera.worker.strategy.bfs

open new terminal -> start the spiders

cd /var/www/html/general-spider

scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider0 -s SEEDS_SOURCE=seeds_es_dmoz.txt

scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider1
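
The commands above reference frontier.spider0 and frontier.spider1, which the cloned general-spider repo should already provide. If they are missing, a hypothetical minimal version of each reuses the shared spider settings and overrides only the partition id:

cat > frontier/spider0.py <<'EOF'
# Hypothetical per-spider settings: partition 0 of the frontier queue
from frontier.spider_settings import *
SPIDER_PARTITION_ID = 0
EOF

cat > frontier/spider1.py <<'EOF'
# Hypothetical per-spider settings: partition 1 of the frontier queue
from frontier.spider_settings import *
SPIDER_PARTITION_ID = 1
EOF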

Note:

Start order: Hadoop -> ZooKeeper -> HBase

Stop order: HBase -> ZooKeeper -> Hadoop
