"""Scrape Baidu Scholar (xueshu.baidu.com) search results and chart
publication counts for the most recent years.

Flow: crawl result pages with Selenium -> parse cards with BeautifulSoup ->
dump rows to CSV -> count rows per publication year -> render a pyecharts
scatter chart (creates ``render.html`` in the working directory).

Caveats (unchanged from the original design):
- Baidu Scholar has anti-scraping measures; the per-page ``time.sleep(2)``
  throttle is deliberate. Crawl gently or expect blocks.
- The CSS class names (``sc_content``, ``sc_info`` ...) mirror the page
  layout at time of writing and may need adjustment when Baidu changes it.
"""

import csv
import time
from collections import Counter

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from pyecharts import options as opts
from pyecharts.charts import Scatter


def _parse_article(article):
    """Extract one result's fields from a ``div.sc_content`` card.

    Returns a 9-element list
    ``[title, author, abstract, keywords, link, journal, year, citation, doi]``
    or ``None`` when the card has no title node (layout drift / ad blocks).
    Missing optional nodes degrade to empty strings instead of raising.
    """

    def _text(tag, cls):
        node = article.find(tag, class_=cls)
        return node.text.strip() if node else ''

    title_node = article.find('h3', class_='t c_font')
    if title_node is None:
        return None
    title = title_node.text.strip()

    link_anchor = title_node.find('a')
    link = link_anchor['href'] if link_anchor and link_anchor.has_attr('href') else ''

    # The info line looks like "author - journal - year ..."; guard each index
    # because sparse cards omit segments.
    info_parts = [part.strip() for part in _text('div', 'sc_info').split('-')]
    author = info_parts[0] if info_parts else ''
    journal = info_parts[1] if len(info_parts) > 1 else ''
    year = info_parts[2].split(' ')[0] if len(info_parts) > 2 else ''

    abstract = _text('div', 'c_font')
    keywords = _text('div', 'sc_tags')
    citation = _text('div', 'sc_cite_cont')
    doi = ''  # DOI is not shown on the result card; would need detail-page parsing

    return [title, author, abstract, keywords, link, journal, year, citation, doi]


def crawl_single_keyword(keyword, num):
    """Crawl up to ``num`` Baidu Scholar results for ``keyword``.

    Args:
        keyword: search term (URL-encoded by the browser as-is).
        num: target number of results; pages hold 10 results each, so
            ``num // 10`` pages are visited (at least one).

    Returns:
        list of 9-element row lists as produced by :func:`_parse_article`.
    """
    driver = webdriver.Chrome()  # requires Chrome + a matching chromedriver
    data = []
    try:
        driver.get(f'http://xueshu.baidu.com/s?wd={keyword}&pn=0')
        for _ in range(max(1, num // 10)):
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for article in soup.find_all('div', class_='sc_content'):
                row = _parse_article(article)
                if row is not None:
                    data.append(row)
            # Selenium 4 removed find_element_by_link_text; stop cleanly
            # when there is no "next page" link (last page reached).
            try:
                driver.find_element(By.LINK_TEXT, '下一页').click()
            except Exception:
                break
            time.sleep(2)  # throttle between pages to avoid anti-bot bans
    finally:
        driver.quit()  # always release the browser, even on parse errors
    return data


def crawl_double_keywords(keyword1, keyword2, num):
    """Crawl both keywords independently and concatenate the result rows.

    NOTE(review): this runs two separate searches (union of results), not a
    combined "keyword1 AND keyword2" query — kept as-is for compatibility.
    """
    return crawl_single_keyword(keyword1, num) + crawl_single_keyword(keyword2, num)


def _save_csv(rows, path='academic_data.csv'):
    """Write the crawled rows to ``path`` with a full 9-column header."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        # Original header left the abstract column unnamed; '摘要' restores
        # the 1:1 mapping between header and the 9 data fields.
        writer.writerow(['文献名称', '作者', '摘要', '关键词', '文献链接',
                         '来源期刊', '发表年份', '引用量', 'doi'])
        writer.writerows(rows)


def _plot_year_counts(rows, top_n=10):
    """Render a scatter chart of publication counts for the ``top_n``
    most recent years found in ``rows`` (year lives at column index 6)."""
    year_count = Counter(row[6] for row in rows)
    recent_years = sorted(year_count, reverse=True)[:top_n]
    chart = (
        Scatter()
        .add_xaxis(recent_years)
        .add_yaxis('', [year_count[y] for y in recent_years])
        .set_global_opts(title_opts=opts.TitleOpts(title='文献数量分布'))
    )
    chart.render()  # writes render.html


if __name__ == '__main__':
    # Guarded so importing this module does not launch Chrome.
    data = crawl_single_keyword('数字经济', 100)
    data += crawl_double_keywords('数字技术', '数字化', 100)
    _save_csv(data)
    _plot_year_counts(data)