scrapy爬虫项目部署到k8s
2021-03-17
背景:
- 要求定时执行
- 每个task都独立执行,有的并行,有的串行
- 本身并没有使用任务调度框架
解决方案:
创建一个 tasks.json:以任务 ID(与 task 名称相同)作为 key,value 包含 cron(调度表达式)和 cmd(启动命令);cmd 中用换行分隔多条命令即可实现串行执行。Jenkins 以选中的任务 ID 作为入参,动态生成启动脚本
tasks.json
{
  "balenciaga_screenshots_zh": {
    "cron": "0 */1 * * *",
    "cmd": "scrapy crawl balenciaga_screenshots_zh"
  },
  "instagram": {
    "cron": "0 */1 * * *",
    "cmd": "scrapy crawl instagram"
  },
  "celine_celine-uk": {
    "cron": "*/1 * * * *",
    "cmd": "scrapy crawl celine\nscrapy crawl celine-uk"
  },
  "celine": {
    "cron": "3 8 * * *",
    "cmd": "scrapy crawl celine"
  }
}
创建基础镜像
python37 Dockerfile
依赖的requirements.txt放到Dockerfile同目录
# Base image for the scrapy projects: Python 3.7 with the shared requirements
# pre-installed, apt/pip pointed at the Aliyun mirrors and the timezone set to
# Asia/Shanghai.
FROM rackspacedot/python37:30

# COPY instead of ADD: a plain local file needs none of ADD's URL/archive handling.
COPY requirements.txt requirements.txt

# One RUN so the apt/pip caches are removed in the same layer that created them.
# "pip install --upgrade pip" runs before the mirror is configured, i.e. against
# pip's default index.
# The mirror is addressed over HTTPS, so the former "install.trusted-host"
# setting (which would disable certificate verification for that host) is no
# longer needed and has been dropped.
RUN sed -i s@/archive.ubuntu.com/@/mirrors.aliyun.com/@g /etc/apt/sources.list \
    && apt-get update \
    # && apt-get install -y --no-install-recommends iputils-ping net-tools curl \
    && pip install --upgrade pip \
    && pip config set global.index-url https://mirrors.aliyun.com/pypi/simple \
    && pip --no-cache-dir install -r requirements.txt --ignore-installed \
    && apt-get autoclean -y \
    && apt-get clean -y \
    && rm -rf ~/.cache/pip/* \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* \
    && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone
构建镜像
# Build the base image from the Dockerfile in the current directory,
# ignoring the layer cache and removing intermediate containers.
IMAGE_URL="harbor.demo.com/demo/python37-baseimage:2.0.0"
docker build \
    --rm \
    --no-cache \
    --tag "${IMAGE_URL}" \
    .
scrapy项目Dockerfile
run.sh是pipeline中动态生成的
# Image for a single scrapy task: project dependencies, the spiders, and the
# run.sh start script that the Jenkins pipeline generates for the selected task.
FROM harbor.demo.com/demo/python37-baseimage:2.0.0

# Set the working directory up front so later instructions need no "cd /app".
WORKDIR /app

# Install dependencies before copying the sources: this layer is then reused
# from the build cache whenever only the spiders or run.sh change.
COPY requirements.txt /app/requirements.txt
RUN pip --no-cache-dir install -r requirements.txt --ignore-installed && rm -rf ~/.cache/pip/*

COPY scrapy_spiders/ /app/scrapy_spiders
COPY run.sh /app/run.sh

# Shell form on purpose: the script is started through /bin/sh -c.
CMD /app/run.sh
创建cronjob模板
cronjob.yml
# CronJob template. Every {PLACEHOLDER} token is replaced by sed in the Jenkins
# pipeline before the file is applied with kubectl.
#
# batch/v1 CronJob is available from Kubernetes 1.21; batch/v1beta1 was removed
# in 1.25. On clusters older than 1.21 use "batch/v1beta1" instead.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {CRONJOB}
spec:
  # {CRON} is substituted with a double-quoted cron expression, so expressions
  # that start with "*" remain valid YAML.
  schedule: {CRON}
  # Do not start a new run while the previous one is still running.
  concurrencyPolicy: Forbid
  failedJobsHistoryLimit: 1
  successfulJobsHistoryLimit: 1
  jobTemplate:
    spec:
      template:
        spec:
          # Replaced by a complete "nodeName: <node>" line.
          {NODE_NAME}
          imagePullSecrets:
            - name: harbor-registry
          #restartPolicy: OnFailure
          restartPolicy: Never
          containers:
            - name: {CRONJOB}
              image: {IMAGE_URL}
              env:
                - name: TZ
                  value: Asia/Shanghai
                - name: GROUP
                  value: {IMAGE_GROUP}
                - name: SCRAPY_PROJECT
                  value: {ENV}
              resources:
                requests:
                  memory: "200Mi"
                  cpu: "50m"
                limits:
                  memory: {LIMIT_MEM}
                  cpu: {LIMIT_CPU}
创建jenkins pipeline
// Declarative pipeline that packages one scrapy task into an image and deploys
// it as a Kubernetes CronJob:
//   1. check out the selected branch/tag,
//   2. generate run.sh from the task's "cmd" entry in tasks.json,
//   3. build and push the image,
//   4. render cronjob.yml with the task's "cron" entry and apply it.
// ENV and CLUSTER are derived from the first '-'-separated field of the job name.
pipeline {
    agent {
        label 'master'
    }
    options {
        ansiColor('xterm')
        buildDiscarder(logRotator(daysToKeepStr: '1', numToKeepStr: '3'))
    }
    tools {
        maven 'apache_maven_3.5.0'
        jdk 'jdk_1.8_202'
        git 'git_2.19.1'
        dockerTool 'docker_19.03.12'
    }
    parameters {
        listGitBranches(
            remoteURL: 'http://gitlab.demo.com/arch/pis-spider.git',
            branchFilter: '.*',
            tagFilter: '*',
            defaultValue: 'master',
            name: 'BRANCH_OR_TAG',
            type: 'PT_BRANCH_TAG',
            selectedValue: 'TOP',
            sortMode: 'DESCENDING_SMART',
            quickFilterEnabled: 'true',
            credentialsId: 'gitlab_username_password_credential')
        // Task ID; must exist as a key in tasks.json.
        choice(name:'TASK', choices:'balenciaga_screenshots_zh\nburberry_uk\nceline_celine-uk\nceline\nceline-uk\ndior_fr\ndior_screenshots_zh\ndior_uk\nfendi\nfendi_screenshots_zh\ninstagram\njimmy_choo_screenshots_zh\njimmychoo\nloewe_screenshots_zh\nmaisonmargiela\nmiumiu_uk\nprada\nprada_screenshots_zh\nrogervivier\nthombrowne\nvalentino_it\nzh-gucci')
    }
    // Entries are ordered so that every variable is defined before it is referenced.
    environment {
        GIT = 'http://gitlab.demo.com/arch/pis-spider.git'
        IMAGE_GROUP = "arch" // Harbor image group (project) the image belongs to
        REPLICAS = 1 // not referenced by any stage below
        TEMPLATE = "cronjob.yml"
        DOCKERFILE = "Dockerfile-py"
        HARBOR_HOST = 'harbor.demo.com'
        // Substituted verbatim for the {NODE_NAME} line of the template.
        NODE_NAME = 'nodeName: test-devops-k8sslave-4'
        // NODE_NAME = " "
        LIMIT_MEM = "2248Mi"
        LIMIT_CPU = "1000m"
        // First '-'-separated field of the job name.
        ENV = sh(script: "echo ${JOB_BASE_NAME} | awk -F '-' '{print \$1}'", returnStdout: true).trim()
        // Same field, with "test" mapped to "newtest"; not referenced by any stage below.
        NEWENV = sh(script: """echo ${JOB_BASE_NAME} | awk -F '-' '{if (\$1=="test") {print "new"\$1} else {print \$1}}' """, returnStdout: true).trim()
        // kubeconfig file name under /data/kubecfg: "dev" jobs use test-cluster, others "<env>-cluster".
        CLUSTER = sh(script: """echo ${JOB_BASE_NAME} | awk -F '-' '{if (\$1=="dev") {print "test-cluster"} else {print \$1"-cluster"}}' """, returnStdout: true).trim()
        K8S_NAMESPACE = "${ENV}-${IMAGE_GROUP}"
        // Repository name taken from the Git URL, prefixed with IMAGE_GROUP unless it already starts with it, lower-cased.
        PROJECT = sh(script: """echo ${GIT} | awk -F '/' '{print \$NF}' | awk -F '.' '{print \$1}' | awk -F '-' '{if (\$1=="${IMAGE_GROUP}") {print \$0} else {print "${IMAGE_GROUP}-"\$0}}'| tr "[:upper:]" "[:lower:]" """, returnStdout: true).trim()
        // CronJob name: "<project>-<task>" with underscores replaced by hyphens.
        CRONJOB = sh(script: """echo ${PROJECT}-${TASK}| tr "_" "-" """, returnStdout: true).trim()
        // Branch or tag name: third '/'-separated field of BRANCH_OR_TAG when present, otherwise the first.
        CHECK_TAG = sh(script: "echo ${BRANCH_OR_TAG} | awk -F '/' '{if (\$3) print \$3; else print \$1}'", returnStdout: true).trim()
        TIME = sh(script: "date '+%Y%m%d%H%M%S'", returnStdout: true).trim()
        // Image tag: branch/tag name plus build timestamp.
        VERSION_VALUE = "${CHECK_TAG}-${TIME}"
        DOCKER_IMAGE = "${IMAGE_GROUP}/${JOB_BASE_NAME}:${VERSION_VALUE}"
    }
    stages {
        stage ('代码获取') {
            steps {
                echo "\033[46;30m************************************************ 拉取代码开始 ************************************************\033[0m"
                deleteDir() // wipe the workspace
                git credentialsId: 'gitlab_username_password_credential', url: "${GIT}"
                // Fail the build (exit 111) when the ref is empty or cannot be checked out.
                sh '[ -n "${CHECK_TAG}" ] && git checkout ${CHECK_TAG} || { echo -e "切换至指定的tag的版本,tag:${CHECK_TAG} 不存在或为空,请检查输入的tag!" && exit 111; }'
                buildName "${CHECK_TAG}"
                echo "\033[46;30m************************************************ 拉取代码结束 ************************************************\033[0m"
            }
        }
        stage ('创建启动脚本') {
            steps {
                echo "\033[46;30m************************************************ 创建脚本开始 ************************************************\033[0m"
                // printf instead of "echo -e": the sh step runs under the agent's /bin/sh,
                // and a POSIX sh such as dash would write a literal "-e" into the script.
                sh '''printf '#!/bin/bash\\ncd /app/scrapy_spiders/\\n' > run.sh && chmod +x run.sh'''
                // -r emits the raw command text (one command per line);
                // -e makes jq exit non-zero when TASK has no "cmd" entry in tasks.json,
                // so the build fails instead of shipping a run.sh that contains "null".
                sh "jq -er '.[\"$TASK\"].cmd'<tasks.json>>run.sh"
                echo "\033[46;30m************************************************ 创建脚本结束 ************************************************\033[0m"
            }
        }
        stage('镜像构建') {
            steps {
                echo "\033[46;30m************************************************ 镜像构建开始 ************************************************\033[0m"
                script {
                    // Reference the variable by its declared name (DOCKERFILE).
                    sh "/usr/bin/cp -f /data/template/docker/${DOCKERFILE} Dockerfile"
                    sh "docker build -t ${HARBOR_HOST}/${DOCKER_IMAGE} ."
                    sh "docker push ${HARBOR_HOST}/${DOCKER_IMAGE}"
                    // Remove the local copy once it has been pushed.
                    sh "docker rmi ${HARBOR_HOST}/${DOCKER_IMAGE}"
                }
                echo "\033[46;30m************************************************ 镜像构建结束 ************************************************\033[0m"
            }
        }
        stage('发布服务至kubernetes集群') {
            environment {
                // No -r on purpose: jq's JSON double quotes are kept, so the schedule is
                // substituted into the template as a quoted YAML string.
                // -e fails the stage when TASK has no "cron" entry in tasks.json.
                CRON = sh(script: "jq -e '.[\"$TASK\"].cron'<tasks.json", returnStdout: true).trim()
            }
            steps {
                script {
                    echo "\033[46;30m************************************************ 发布服务至kubernetes集群开始 ************************************************\033[0m"
                    sh "cp /data/template/k8s/${TEMPLATE} ${TEMPLATE}"
                    // Fill in every {PLACEHOLDER} of the template in place.
                    sh "sed -i -e 's#{IMAGE_URL}#${HARBOR_HOST}/${DOCKER_IMAGE}#g;s#{ENV}#${ENV}#g;s#{NODE_NAME}#${NODE_NAME}#g;s#{PROJECT}#${PROJECT}#g;s#{CRONJOB}#${CRONJOB}#g;s#{CRON}#${CRON}#g;s#{IMAGE_GROUP}#${IMAGE_GROUP}#g;s#{LIMIT_MEM}#${LIMIT_MEM}#g;s#{LIMIT_CPU}#${LIMIT_CPU}#g;' ${TEMPLATE}"
                    // Connectivity check before applying the manifest.
                    sh "kubectl --kubeconfig /data/kubecfg/${CLUSTER} cluster-info && kubectl --kubeconfig /data/kubecfg/${CLUSTER} get nodes"
                    sh "kubectl --kubeconfig /data/kubecfg/${CLUSTER} apply -f ${TEMPLATE} --namespace=${K8S_NAMESPACE}"
                    echo "\033[46;30m************************************************ 发布服务至kubernetes集群结束 ************************************************\033[0m"
                }
            }
        }
    }
}