Below are code examples implementing your requirements.

Task 1: scrape the content of the Three Hundred Tang Poems

```python
import requests
from bs4 import BeautifulSoup

# Request the page and parse the poem data.
def get_poems():
    url = "https://so.gushiwen.cn/gushi/tangshi.aspx"
    # Use a browser-like User-Agent in case the site rejects the default one.
    headers = {"User-Agent": "Mozilla/5.0"}
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    response.encoding = "utf-8"
    soup = BeautifulSoup(response.text, "html.parser")

    # Get all poem blocks. NOTE: these selectors depend on the page's
    # current HTML layout and may need adjusting if it changes.
    poem_list = soup.find_all("div", class_="typecont")

    poems = []
    # Walk the poem blocks and extract the fields of interest.
    for poem in poem_list:
        # Poem category, taken from the preceding "bookMl" heading block
        poem_type = poem.find_previous_sibling("div", class_="bookMl").find("span").text
        # Poem title
        poem_title = poem.find("strong").text
        # Poem body, with newlines and tabs stripped
        poem_content = poem.find("span", style=False).text.replace("\n", "").replace("\t", "")
        # Poem author
        poem_author = poem.find_all("p")[1].find_all("a")[1].text
        poems.append([poem_type, poem_title, poem_content, poem_author])
    return poems

# Fetch the poems.
poems_data = get_poems()

# Save the data to a tab-separated text file.
with open("poems.txt", "w", encoding="utf-8") as f:
    for poem in poems_data:
        f.write("\t".join(poem))
        f.write("\n")
```

Task 2: text feature vectorization

```python
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer's default tokenizer does not segment Chinese, so run
# jieba first and join the tokens with spaces.
poem_contents = [" ".join(jieba.cut(poem[2])) for poem in poems_data]

# Build the tf-idf matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(poem_contents)

# Vocabulary terms. get_feature_names() was removed in scikit-learn 1.2;
# use get_feature_names_out() instead.
feature_names = vectorizer.get_feature_names_out()

# Save each poem's metadata together with its dense tf-idf vector.
with open("tfidf_values.txt", "w", encoding="utf-8") as f:
    for i in range(len(poems_data)):
        tfidf_values = X[i].toarray().tolist()[0]
        # Build the row without mutating poems_data, which Task 3 reuses.
        row = poems_data[i] + [tfidf_values]
        f.write("\t".join(str(x) for x in row))
        f.write("\n")
```

Task 3: statistical analysis

```python
import pandas as pd
import jieba
from collections import Counter
from wordcloud import WordCloud

# Count poems per category. (Writing .xlsx via pandas requires openpyxl.)
poem_types = [poem[0] for poem in poems_data]
type_counts = Counter(poem_types)
type_df = pd.DataFrame(list(type_counts.items()), columns=["type", "count"])
type_df.to_excel("poem_type_counts.xlsx", index=False)

# Count poems per author.
poem_authors = [poem[3] for poem in poems_data]
author_counts = Counter(poem_authors)
author_df = pd.DataFrame(list(author_counts.items()), columns=["author", "count"])
author_df.to_excel("poem_author_counts.xlsx", index=False)

# Segment the poem texts with jieba.
poem_contents = [poem[2] for poem in poems_data]
text = " ".join(poem_contents)
words = " ".join(jieba.cut(text))

# Generate a word cloud. font_path must point to a font that covers
# Chinese characters (e.g. SimHei); adjust the path for your system.
wordcloud = WordCloud(font_path="simhei.ttf", width=800, height=600).generate(words)
wordcloud.to_file("wordcloud.png")
```

The code above uses requests and BeautifulSoup to fetch and parse the page, scikit-learn's TfidfVectorizer for text feature vectorization, pandas and jieba for the statistics, and wordcloud for the word cloud image. Install any missing packages first (requests, beautifulsoup4, scikit-learn, pandas, openpyxl, jieba, wordcloud). Two short verification sketches follow.
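Since Task 1 writes poems.txt as tab-separated values, the scraped data can be reloaded with pandas for ad-hoc checks. This is a minimal sketch; the column names are assumptions that mirror the write order in get_poems().

```python
import pandas as pd

# Reload the Task 1 output. Column names are assumed, matching the
# [type, title, content, author] order written by get_poems().
df = pd.read_csv("poems.txt", sep="\t",
                 names=["type", "title", "content", "author"])
print(df.head())
print(df["author"].value_counts().head(10))  # ten most frequent authors
```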
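To sanity-check the vectorization in Task 2, the sketch below prints the highest-weighted terms for the first poem. It assumes the variables X, vectorizer, and poems_data from Task 2 are still in scope; the cutoff of ten terms is an arbitrary choice.

```python
import numpy as np

# A minimal verification sketch. Assumes X, vectorizer, and poems_data
# from Task 2 are in scope.
feature_names = vectorizer.get_feature_names_out()
row = X[0].toarray().ravel()            # dense tf-idf weights for poem 0
top_idx = np.argsort(row)[::-1][:10]    # indices of the 10 largest weights
print(poems_data[0][1])                 # the poem's title
for i in top_idx:
    if row[i] > 0:                      # skip zeros when a poem has <10 terms
        print(f"{feature_names[i]}\t{row[i]:.4f}")
```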