Skip to main content
Python 多版本管理工具

管理多个Python版本和它们的虚拟环境对于任何需要在不同项目间切换的开发者来说都是一项基础技能。从官方的 venv 到强大的 pyenv 和其他第三方工具,Python社区提供了一系列的工具来简化这一过程。本文将为你提供一个全面(未来会不断补全的)的指南,帮助你掌握这些工具的使用方法。

conda

venv

venv 是 python 官方在 python 3.3 版本内置的一个标准库模块,用于创建虚拟环境,帮助用户快速创建干净、完全隔离的且不同版本的 python 解释器以便在不同项目中开发。

venv 本身不提供 python 版本的安装,而是直接依赖系统中已安装的 python;如果想要使用其他版本的 python,需要选择其他的版本管理工具。


Marshio · About 5 min · python · spider
Python Spider

以下是我的一些爬虫备份

东财

import datetime
import requests
from loguru import logger
from bs4 import BeautifulSoup
from apscheduler.schedulers.blocking import BlockingScheduler

# HTTP request headers imitating a Chrome 119 browser session against
# reportapi.eastmoney.com; shared by every list/detail request below.
headers = {
    'Accept': '*/*',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-GB;q=0.8,en-US;q=0.7,en;q=0.6',
    'Connection': 'keep-alive',
    'Referer': 'https://data.eastmoney.com/report/industry.jshtml',
    'Sec-Fetch-Dest': 'script',
    'Sec-Fetch-Mode': 'no-cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 '
                  'Safari/537.36',
    'sec-ch-ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'Host': 'reportapi.eastmoney.com'
}

# Per-category hit counters, updated by fetch_eastmoney_report via
# records.update({category: hits}).
# NOTE(review): the 'day' slot is never written in the visible code, and
# the two jg categories ('策略报告'/'宏观研报') are not updated either —
# confirm whether other code relies on these entries.
records = {
    'day': '',
    '个股研报': 0,
    '行业研报': 0,
    '策略报告': 0,
    '宏观研报': 0,
}


def report():
    """Build a fresh report record with every field reset to its default.

    Returns a new dict on each call, so callers can mutate it freely
    without sharing state.
    """
    string_fields = (
        'category', 'industry_code', 'industry_name', 'stock_code',
        'stock_name', 'publish_date', 'title', 'summary', 'pdf_url',
        'author', 'org_code', 'org_name', 'org_s_name', 'em_rating_name',
        'url', 'seed_url',
        # industry name of the individual stock
        'indv_indu_name',
        # industry code of the individual stock
        'indv_indu_code',
    )
    record = {'id': 0}
    record.update({field: '' for field in string_fields})
    return record


def fetch_eastmoney_report(url, category, begin, end, page_no):
    """Crawl one eastmoney report-list category plus each report's detail page.

    Args:
        url: list-API URL template with three ``{}`` slots (begin, end, page_no).
        category: category label; also the key updated in the global ``records``.
        begin: start date string (e.g. '2023-11-20').
        end: end date string, same format.
        page_no: page to start from; subsequent pages are fetched in a loop
            until ``TotalPage`` is reached.
    """
    while True:
        # FIX: headers must be passed as a keyword argument. The original
        # positional call requests.get(url, headers) bound the dict to the
        # ``params`` parameter, so the custom headers were never sent.
        response = requests.get(url.format(begin, end, page_no), headers=headers)
        payload = response.json()  # parse the body once instead of per field
        response.close()
        total_page = payload['TotalPage']
        hits = payload['hits']
        # record how many reports this category returned
        records.update({category: hits})
        logger.info(
            '当前url:{},页数:{},当前页码:{},数量:{}'.format(url.format(begin, end, page_no), total_page, page_no, hits))
        report_list = []
        for data in payload['data']:
            report_info = report()
            report_info['category'] = category
            report_info['title'] = data['title']
            report_info['seed_url'] = url
            # individual stock code / name
            report_info['stock_code'] = data['stockCode']
            report_info['stock_name'] = data['stockName']
            # industry code / name (the original comments mislabelled these
            # as stock code/name)
            report_info['industry_code'] = data['industryCode']
            report_info['industry_name'] = data['industryName']
            # key needed to build the detail-page URL below
            report_info['info_code'] = data['infoCode']
            # current rating
            report_info['em_rating_name'] = data['emRatingName']
            # 'researcher' is the string form of the (possibly multiple) authors
            report_info['author'] = data['researcher']
            # organisation code / full name / short name
            report_info['org_code'] = data['orgCode']
            report_info['org_name'] = data['orgName']
            report_info['org_s_name'] = data['orgSName']
            report_info['indv_indu_name'] = data['indvInduName']
            report_info['indv_indu_code'] = data['indvInduCode']
            # publish date
            report_info['publish_date'] = data['publishDate']
            report_list.append(report_info)
        # visit each detail page to pick up the summary text and PDF link
        for report_info in report_list:
            report_detail_uri = 'https://data.eastmoney.com/report/zw_industry.jshtml?infocode={}' \
                .format(report_info['info_code'])
            detail_html = requests.get(report_detail_uri, headers=headers)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            detail_html.close()
            # NOTE(review): select(...)[0] raises IndexError if the page
            # layout changes — confirm whether crashing here is acceptable.
            pdf_url = soup.select('a.pdf-link')[0].get('href')
            summary_text = soup.select('div.ctx-content')[0].get_text()
            report_info['url'] = report_detail_uri
            report_info['pdf_url'] = pdf_url
            report_info['summary'] = summary_text
        # TODO(review): report_list is built but never persisted or returned —
        # confirm where the crawled data is supposed to go.
        if page_no >= total_page:
            break
        # iterate instead of recursing: avoids unbounded call-stack growth
        # on categories with many pages
        page_no += 1


def _strip_risk_sections(summary_text):
    """Drop trailing '风险提示'/'风险因素' boilerplate from a summary.

    FIX: the original tested ``str.rfind(...)`` for truthiness, which is
    wrong on both ends: rfind returns -1 (truthy) when the marker is
    absent — and slicing with the matching find()/rfind() of -1 then
    chopped the last character — while it returns 0 (falsy) when the
    marker starts the string, skipping the trim entirely.
    """
    cut = summary_text.find('风险提示')
    if cut != -1:
        summary_text = summary_text[:cut]
    cut = summary_text.rfind('风险因素')
    if cut != -1:
        summary_text = summary_text[:cut]
    return summary_text


def fetch_eastmoney_jg_report(url, category, begin, end, page_no):
    """Crawl one eastmoney jg (strategy/macro) report category plus detail pages.

    Args:
        url: jg-API URL template with three ``{}`` slots (begin, end, page_no).
        category: category label stored on each report record.
        begin: start date string (e.g. '2023-11-20').
        end: end date string, same format.
        page_no: page to start from; subsequent pages are fetched in a loop
            until ``TotalPage`` is reached.
    """
    while True:
        # FIX: headers passed as a keyword argument — the original positional
        # call sent the header dict as query params instead.
        response = requests.get(url.format(begin, end, page_no), headers=headers)
        payload = response.json()  # parse the body once instead of per field
        response.close()
        total_page = payload['TotalPage']
        hits = payload['hits']
        logger.info(
            '当前url:{},页数:{},当前页码:{},数量:{}'.format(url.format(begin, end, page_no), total_page, page_no, hits))
        report_list = []
        for data in payload['data']:
            report_info = report()
            report_info['category'] = category
            report_info['title'] = data['title']
            report_info['seed_url'] = url
            # key needed to build the detail-page URL below
            report_info['encode_url'] = data['encodeUrl']
            # 'researcher' is the string form of the (possibly multiple) authors
            report_info['author'] = data['researcher']
            # organisation code / full name / short name
            report_info['org_code'] = data['orgCode']
            report_info['org_name'] = data['orgName']
            report_info['org_s_name'] = data['orgSName']
            # publish date
            report_info['publish_date'] = data['publishDate']
            report_list.append(report_info)
        # visit each detail page to pick up the summary text and PDF link
        for report_info in report_list:
            report_detail_uri = 'https://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl={}' \
                .format(report_info['encode_url'])
            detail_html = requests.get(report_detail_uri, headers=headers)
            soup = BeautifulSoup(detail_html.text, 'lxml')
            detail_html.close()
            pdf_url = soup.select('a.pdf-link')[0].get('href')
            summary_text = _strip_risk_sections(soup.select('div.ctx-content')[0].get_text())
            report_info['url'] = report_detail_uri
            report_info['pdf_url'] = pdf_url
            report_info['summary'] = summary_text
        # TODO(review): report_list is built but never persisted or returned —
        # confirm where the crawled data is supposed to go.
        if page_no >= total_page:
            break
        # iterate instead of recursing to avoid unbounded call-stack growth
        page_no += 1


def eastmoney_spider():
    """Run one full crawl pass: fetch today's reports for all four categories."""
    now = datetime.datetime.today().strftime('%Y-%m-%d %H:%M:%S')
    today = datetime.datetime.today().strftime('%Y-%m-%d')
    logger.info("定时任务执行 at {}".format(now))

    # list-API endpoints: qType=0 → individual stocks, qType=1 → industries
    report_uris = {
        '个股研报': 'https://reportapi.eastmoney.com/report/list?pageSize=100&beginTime={}&endTime={}&pageNo={}&qType=0',
        '行业研报': 'https://reportapi.eastmoney.com/report/list?pageSize=100&beginTime={}&endTime={}&pageNo={}&qType=1',
    }
    # jg-API endpoints: qType=2 → strategy, qType=3 → macro
    jg_uris = {
        '策略报告': 'https://reportapi.eastmoney.com/report/jg?pageSize=100&beginTime={}&endTime={}&pageNo={}&qType=2',
        '宏观研报': 'https://reportapi.eastmoney.com/report/jg?pageSize=100&beginTime={}&endTime={}&pageNo={}&qType=3',
    }

    # both crawls are bounded to today's date and start at page 1
    for category, seed in report_uris.items():
        fetch_eastmoney_report(seed, category, today, today, 1)
    for category, seed in jg_uris.items():
        fetch_eastmoney_jg_report(seed, category, today, today, 1)


if __name__ == '__main__':
    # Crawl once immediately on startup...
    eastmoney_spider()
    scheduler = BlockingScheduler()
    # ...then re-run every hour on the hour (cron: any day, any hour, minute 0).
    scheduler.add_job(eastmoney_spider, "cron", day='*', hour='*', minute='0')
    # BlockingScheduler.start() blocks the main thread from here on.
    scheduler.start()


Marshio · About 3 min · python · spider