A small script for fetching lagou positions in a specific district

Things have been slow at work lately and I'm thinking about switching jobs, but my situation is a bit awkward: I've only been working for a few months and my skills are pretty average, so finding something new isn't easy.
Lagou seems to do a decent job with internet-industry recruiting, but as far as I can tell its location filter only goes down to the city level: you can filter for Beijing, but not for Haidian district specifically. Since I'd rather work near my school, I wrote a script to pick out all the Python positions located in Haidian. The idea is simple: send the search request, walk through the listing pages to collect all the position ids, then request each position's page by id, extract the location information from it, and print the url whenever the location matches.

First, here is an asynchronous little crawler I wrote a while ago; it is noticeably faster when there are many pages to fetch. async_spider.py

#!/usr/bin/env python
# -*- coding:utf-8 -*-

import time
import logging
from datetime import timedelta
from tornado import httpclient, gen, ioloop, queues
import traceback
from bs4 import BeautifulSoup


def get_logger(name):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(name)
    return logger


class AsyncSpider(object):
    """A simple class of asynchronous spider."""
    def __init__(self, urls, concurrency=10, results=None, **kwargs):
        self.concurrency = concurrency
        self._q = queues.Queue()
        self._fetching = set()
        self._fetched = set()
        if results is None:
            results = []
        self.results = results
        for url in urls:
            self._q.put(url)
        self.logger = get_logger(self.__class__.__name__)
        httpclient.AsyncHTTPClient.configure(
            "tornado.curl_httpclient.CurlAsyncHTTPClient"
        )

    def fetch(self, url, **kwargs):
        fetch = getattr(httpclient.AsyncHTTPClient(), 'fetch')
        http_request = httpclient.HTTPRequest(url, **kwargs)
        return fetch(http_request, raise_error=False)

    def handle_html(self, url, html):
        """处理html页面"""
        print(url)

    def handle_response(self, url, response):
        """处理http响应,对于200响应码直接处理html页面,
        否则按照需求处理不同响应码"""
        if response.code == 200:
            self.handle_html(url, response.body)

        elif response.code == 599:    # retry
            self._fetching.remove(url)
            self._q.put(url)

    @gen.coroutine
    def get_page(self, url):
        # yield gen.sleep(10)    # sleep when need
        try:
            response = yield self.fetch(url)
            self.logger.debug('######fetched %s' % url)
        except Exception as e:
            self.logger.debug('Exception: %s %s' % (e, url))
            raise gen.Return(e)
        raise gen.Return(response)    # py3 can just return response

    @gen.coroutine
    def _run(self):
        @gen.coroutine
        def fetch_url():
            current_url = yield self._q.get()
            try:
                if current_url in self._fetching:
                    return

                self.logger.debug('fetching****** %s' % current_url)
                self._fetching.add(current_url)

                response = yield self.get_page(current_url)
                self.handle_response(current_url, response)    # handle response

                self._fetched.add(current_url)

            finally:
                self._q.task_done()

        @gen.coroutine
        def worker():
            while True:
                yield fetch_url()

        # Start workers, then wait for the work queue to be empty.
        for _ in range(self.concurrency):
            worker()

        yield self._q.join(timeout=timedelta(seconds=300000))

        try:
            assert self._fetching == self._fetched
        except AssertionError:    # some http errors were not handled
            print(self._fetching-self._fetched)
            print(self._fetched-self._fetching)

    def run(self):
        io_loop = ioloop.IOLoop.current()
        io_loop.run_sync(self._run)
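
For reference, a minimal usage sketch of the class above (the urls here are just placeholders): subclass it, override handle_html, and call run(). The position-page spider further down does exactly this.

from async_spider import AsyncSpider


class MySpider(AsyncSpider):
    def handle_html(self, url, html):
        # put your own parsing here; the base class only prints the url
        print('%s: %d bytes' % (url, len(html)))


if __name__ == '__main__':
    urls = ['http://httpbin.org/get?page=%d' % i for i in range(10)]
    MySpider(urls, concurrency=5).run()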

Next, open the browser's developer tools. While filtering lagou's position listings, watch the requests being sent; when you spot the right one, right-click it in chrome and choose copy as curl, which puts a ready-to-run curl command string on the clipboard. Paste it into a terminal and you can see the data lagou returns. My shell skills are limited, though, so I wrote a small python script that converts this string into the parameters the requests library needs.

#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""
chrome有个功能,对于请求可以直接右键copy as curl,然后在命令行里边用curl
模拟发送请求。现在需要把此curl字符串处理成requests库可以传入的参数格式,
http://stackoverflow.com/questions/23118249/whats-the-difference-between-request-payload-vs-form-data-as-seen-in-chrome
"""

import json
import re
import requests


def encode_to_dict(encoded_str):
    """ 将encode后的数据拆成dict
    >>> encode_to_dict('name=foo')
    {'name': foo'}
    >>> encode_to_dict('name=foo&val=bar')
    {'name': 'foo', 'val': 'var'}
    """

    pair_list = encoded_str.split('&')
    d = {}
    for pair in pair_list:
        if pair:
            key, val = pair.split('=', 1)
            d[key] = val
    return d


def parse_curl_str(s):
    """convert chrome curl string to url, headers dict and data"""
    pat = re.compile("'(.*?)'")
    str_list = [i.strip() for i in re.split(pat, s)]   # split the curl string on quoted chunks

    url = ''
    headers = {}
    data = ''

    for i in range(0, len(str_list)-1, 2):
        arg = str_list[i]
        string = str_list[i+1]

        if arg.startswith('curl'):
            url = string

        elif arg.startswith('-H'):
            header_key = string.split(':', 1)[0].strip()
            header_val = string.split(':', 1)[1].strip()
            headers[header_key] = header_val

        elif arg.startswith('--data'):
            data = string

    return url, headers, data


def test_lagou():
    lagou_str = """curl 'http://www.lagou.com/jobs/positionAjax.json?gj=1-3%E5%B9%B4&px=default&city=%E5%8C%97%E4%BA%AC' -H 'Cookie: user_trace_token=20150911115414-e35eaafdf3cd430fb0a9fed4ca568273; LGUID=20150911115415-c53a987d-5838-11e5-8fa5-525400f775ce; fromsite=www.baidu.com; LGMOID=20160112143105-A2EDC0F26EF4FF9F7A0E261E95FFC0D5; tencentSig=5171360768; JSESSIONID=0F7B9502EFBBC658FD043C42196C5F58; PRE_UTM=; PRE_HOST=; PRE_SITE=http%3A%2F%2Fwww.lagou.com%2Fjobs%2F1018226.html; PRE_LAND=http%3A%2F%2Fwww.lagou.com%2Fjobs%2F1018226.html; login=true; unick=%E7%8E%8B%E5%AE%81%E5%AE%81-Python%E5%BA%94%E8%81%98; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=77; SEARCH_ID=c70df91703ee4c1ca380d883e93dde6c; index_location_city=%E5%8C%97%E4%BA%AC; _gat=1; HISTORY_POSITION=1326282%2C9k-18k%2C%E4%BB%80%E4%B9%88%E5%80%BC%E5%BE%97%E4%B9%B0%2CPython%7C1247829%2C8k-16k%2CPair%2CPython%7C1162119%2C8k-15k%2C%E5%A4%A7%E7%A0%81%E7%BE%8E%E8%A1%A3%2CPython%E5%B7%A5%E7%A8%8B%E5%B8%88%7C411250%2C10k-20k%2C%E6%9C%89%E5%BA%B7%E7%88%B1%E5%B8%AE%2CPython%20%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%7C1269616%2C12k-20k%2CE%E7%98%A6%E7%BD%91%2CPython%7C; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1452172939,1452231058,1452231062,1452580269; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1453095567; LGSID=20160118132416-b7c3fc3c-bda3-11e5-8bf5-5254005c3644; LGRID=20160118133926-d61df46a-bda5-11e5-8a39-525400f775ce; _ga=GA1.2.878965075.1441943655' -H 'Origin: http://www.lagou.com' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' -H 'Accept: application/json, text/javascript, */*; q=0.01' -H 'Referer: http://www.lagou.com/jobs/list_Python?gj=1-3%E5%B9%B4&px=default&city=%E5%8C%97%E4%BA%AC' -H 'X-Requested-With: XMLHttpRequest' -H 'Connection: keep-alive' --data 'first=false&pn=9&kd=Python' --compressed"""
    url, headers, data = parse_curl_str(lagou_str)
    r = requests.post(url, data=data, headers=headers)
    print(r.content)


if __name__ == '__main__':
    test_lagou()

The ugly lagou_str above is the string copied from chrome; the function splits it into three parts: url, headers and post data. This copy as curl feature has quite a few other uses, by the way: paste the string into a terminal with a repeat command in front of it and you have a posting bot, and for some js-heavy sites you can replay the copied request to grab the ajax data directly. But I digress...
After the split, the request can be sent with requests' post method; data is the post payload obtained from parsing.
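
As a quick sanity check of parse_curl_str, here is a tiny made-up curl string run through it (the url, header and data values are hypothetical; the script above is assumed to be saved as web_util.py, which is also how the spider script below imports it):

from web_util import parse_curl_str

curl_str = """curl 'http://example.com/api' -H 'Accept: application/json' --data 'name=foo&val=bar' --compressed"""
url, headers, data = parse_curl_str(curl_str)
print(url)        # http://example.com/api
print(headers)    # {'Accept': 'application/json'}
print(data)       # name=foo&val=bar
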
The remaining work is to request all the listing pages, collect the position links that come back, then request each position page and pull the location information out of it to see whether it meets the requirement.

#!/usr/bin/env python
# -*- coding:utf-8 -*-

"""自己投简历用的,用来拿到lagou职位符合预期地理位置的职位信息"""

import _env
import re
import json
import requests
from async_spider import AsyncSpider
from web_util import parse_curl_str
from functools import wraps


def retry(times=3):
    """requests retry decorator"""
    def _retry(func):
        @wraps(func)
        def _wrapper(*args, **kwargs):
            index = 0
            while index < times:
                index += 1
                try:
                    response = func(*args, **kwargs)
                    if response.status_code != 200:
                        print('retry', index)
                        continue
                    else:
                        break
                except Exception as e:
                    print(e)
                    response = None
            return response
        return _wrapper
    return _retry


@retry(3)
def fetch_json(url, data, headers):
    return requests.post(url, data=data, headers=headers)  # use data


def parse_json(s):
    """拿到每页请求得到的职位列表"""
    content = json.loads(s).get('content')
    result = content.get('result')
    for each in result:
        yield each.get('positionId')


def get_all_urls():
    res = []
    for page in range(1, 20):
        # curl string copied from chrome (right click -> copy as curl); tweak the parameters as needed
        lagou_str = """curl 'http://www.lagou.com/jobs/positionAjax.json?gj=1-3%E5%B9%B4&px=default&city=%E5%8C%97%E4%BA%AC' -H 'Cookie: user_trace_token=20150911115414-e35eaafdf3cd430fb0a9fed4ca568273; LGUID=20150911115415-c53a987d-5838-11e5-8fa5-525400f775ce; fromsite=www.baidu.com; LGMOID=20160112143105-A2EDC0F26EF4FF9F7A0E261E95FFC0D5; tencentSig=5171360768; JSESSIONID=0F7B9502EFBBC658FD043C42196C5F58; PRE_UTM=; PRE_HOST=; PRE_SITE=http%3A%2F%2Fwww.lagou.com%2Fjobs%2F1018226.html; PRE_LAND=http%3A%2F%2Fwww.lagou.com%2Fjobs%2F1018226.html; login=true; unick=%E7%8E%8B%E5%AE%81%E5%AE%81-Python%E5%BA%94%E8%81%98; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=77; SEARCH_ID=c70df91703ee4c1ca380d883e93dde6c; index_location_city=%E5%8C%97%E4%BA%AC; _gat=1; HISTORY_POSITION=1326282%2C9k-18k%2C%E4%BB%80%E4%B9%88%E5%80%BC%E5%BE%97%E4%B9%B0%2CPython%7C1247829%2C8k-16k%2CPair%2CPython%7C1162119%2C8k-15k%2C%E5%A4%A7%E7%A0%81%E7%BE%8E%E8%A1%A3%2CPython%E5%B7%A5%E7%A8%8B%E5%B8%88%7C411250%2C10k-20k%2C%E6%9C%89%E5%BA%B7%E7%88%B1%E5%B8%AE%2CPython%20%E5%BC%80%E5%8F%91%E5%B7%A5%E7%A8%8B%E5%B8%88%7C1269616%2C12k-20k%2CE%E7%98%A6%E7%BD%91%2CPython%7C; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1452172939,1452231058,1452231062,1452580269; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1453095567; LGSID=20160118132416-b7c3fc3c-bda3-11e5-8bf5-5254005c3644; LGRID=20160118133926-d61df46a-bda5-11e5-8a39-525400f775ce; _ga=GA1.2.878965075.1441943655' -H 'Origin: http://www.lagou.com' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: zh-CN,zh;q=0.8,en-US;q=0.6,en;q=0.4' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36' -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' -H 'Accept: application/json, text/javascript, */*; q=0.01' -H 'Referer: http://www.lagou.com/jobs/list_Python?gj=1-3%E5%B9%B4&px=default&city=%E5%8C%97%E4%BA%AC' -H 'X-Requested-With: XMLHttpRequest' -H 'Connection: keep-alive' --data 'first=false&pn={0}&kd=Python' --compressed""".format(page)
        url, headers, data = parse_curl_str(lagou_str)
        r = fetch_json(url, data=data, headers=headers)
        if r and r.status_code == 200:
            res.extend(parse_json(r.content))

    position_url = 'http://www.lagou.com/jobs/%s.html'
    return (position_url % str(_id) for _id in res)


class PositionPageSpider(AsyncSpider):
    def handle_html(self, url, html):
        match = re.search(r"positionAddress = '(.*?)'", html)
        if not match:    # page layout changed or the request was blocked
            return
        position = match.group(1)
        print(position)
        if '海淀' in position:    # print the url of a matching position
            print(url)


def test_get_all_urls():
    """use py.test run this function"""
    res = list(get_all_urls())
    for i in res:
        print(i)
    print(len(res))


if __name__ == '__main__':
    urls = list(get_all_urls())
    s = PositionPageSpider(urls)
    s.run()

This is the script that does the filtering. Since there are only a few listing pages, a plain loop requests each page's position json and pulls the position ids out of it; once the ids from all the listing pages are collected, the resulting few hundred urls are handed to the asynchronous spider, which fetches each page, parses the html and prints the url of every position that matches. Note that fetching a listing page may fail, so a retry decorator is used to retry a few times on failure.
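
Just to show the retry decorator on its own, here is a small standalone sketch that wraps a plain requests.get; the url below is a placeholder, and lagou_spider is a hypothetical module name for the spider script above (save it under whatever name you like and adjust the import):

import requests

from lagou_spider import retry    # hypothetical module name for the spider script above


@retry(3)
def fetch(url):
    return requests.get(url, timeout=10)


if __name__ == '__main__':
    r = fetch('http://httpbin.org/status/500')    # always answers 500, so all 3 attempts are used
    print(r.status_code if r is not None else 'no response')
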
Finally, opening them one by one by hand is tedious, so redirect the urls printed above into a file url.txt and then do something like this:

import webbrowser


def main():
    with open('./url.txt', 'r') as f:
        for line in f:
            url = line.strip()
            if url:
                raw_input()    # press Enter to open the next one; use input() on py3
                print(url)
                webbrowser.open_new_tab(url)


if __name__ == '__main__':
    main()

And that's it: run it and the Python positions in Haidian open in the browser one after another, one tab per press of Enter... I clearly have too much free time on my hands. If anyone around Haidian is hiring a junior Python guy, feel free to contact me; my skills are mediocre and I don't have much experience, but I'm easy to train, eat little and get plenty done ( ´◔ ‸◔’)