```python
import requests
from bs4 import BeautifulSoup
import csv
import time
import random

def get_anjuke_rental(page):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Referer': 'https://km.anjuke.com/',
        'Cookie': 'aQQ_ajkguid=DAD5FF6F-F8C7-3A74-D6D3-DDBD83A22773; ajk-appVersion=; fzq_h=c460b2e3f6fa0423cdfde83f31cc1ca7_1738734512582_d8af942581ce43adab63aaa3da34c3dc_1782055253; id58
```


Code problem analysis

1. Incomplete anti-scraping detection: the check only looks at the page title and the response text, which may not be thorough enough.
```python
if "验证" in soup.title.text or "安全验证" in response.text:
    print("触发反爬机制!")
    return
```
2. Inadequate handling of failed requests: there is no retry mechanism when a request fails, which can lead to lost data.
```python
if response.status_code != 200:
    print(f'第 {page} 页请求失败,状态码:{response.status_code}')
    return
```
3. Insufficient exception handling: when parsing a listing, a missing or malformed field makes the program skip that listing without recording detailed error information.
```python
except Exception as e:
    print(f'第 {page} 页第 {index + 1} 个房源解析失败: {str(e)}')
```
4. Hard-coded headers and cookies: these values can expire or become invalid, at which point the crawler stops working.
```python
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
    'Referer': 'https://km.anjuke.com/',
    'Cookie': 'aQQ_ajkguid=DAD5FF6F-F8C7-3A74-D6D3-DDBD83A22773; ajk-appVersion=; fzq_h=c460b2e3f6fa0423cdfde83f31cc1ca7_1738734512582_d8af942581ce43adab63aaa3da34c3dc_1782055253; id58=CkwA6mei+7ohVG2DnYupAg==; ...'
}
```
5. Missing CSV file initialization: the `writer` object is never defined, so writing rows to the CSV file raises an error.
```python
writer.writerow([title, price, area, layout, orientation, region, tags])
```

Optimization directions

1. Strengthen anti-scraping detection: add more checks, such as looking for specific anti-scraping elements in the page.
2. Add a retry mechanism for failed requests: use a loop plus a delay to handle request failures.
3. Improve exception handling: record in detail why each listing failed to parse, then continue with the next listing.
4. Obtain headers and cookies dynamically: fetch valid headers and cookies by simulating a login or by some other means (see the session sketch after the optimized code).
5. Initialize the CSV file: create the CSV file and its `writer` before using it.

Optimized code

```python
import requests
from bs4 import BeautifulSoup
import csv
import time
import random


def get_anjuke_rental(page):
    # Headers and cookies (example; ideally obtained dynamically)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Referer': 'https://km.anjuke.com/',
        'Cookie': 'aQQ_ajkguid=DAD5FF6F-F8C7-3A74-D6D3-DDBD83A22773; aQ...'
    }
    url = f'https://km.zu.anjuke.com/fangyuan/p{page}/?kw='  # note the pagination parameter
    max_retries = 3
    for attempt in range(max_retries):
        try:
            print(f'正在请求第 {page} 页:{url}')
            response = requests.get(url, headers=headers, timeout=15)
            response.encoding = 'utf-8'
            if response.status_code != 200:
                print(f'第 {page} 页请求失败,状态码:{response.status_code}')
                continue
            soup = BeautifulSoup(response.text, 'lxml')
            # Anti-scraping detection
            if "验证" in soup.title.text or "安全验证" in response.text or soup.find('div', class_='verify-container'):
                print("触发反爬机制!")
                return
            house_list = soup.find_all('div', class_='zu-itemmod clearfix')
            for index, house in enumerate(house_list):
                try:
                    # Title
                    title_tag = house.find('b', class_='strongbox')
                    title = title_tag.text.strip() if title_tag else ''
                    # Price (extract the <strong> content directly)
                    price_tag = house.find('strong', class_='price')
                    price = price_tag.text.strip().replace('元/月', '') if price_tag else ''
                    # Region info (strip extra characters)
                    address_tag = house.find('address', class_='details-item tag')
                    if address_tag:
                        for a_tag in address_tag.find_all('a'):
                            a_tag.decompose()
                        region = address_tag.get_text(strip=True) \
                            .replace('\xa0', '') \
                            .replace(' ', '') \
                            .strip('-')
                    else:
                        region = ''
                    # Area, layout, orientation (from the bot-tag spans)
                    bot_tag = house.find('p', class_='details-item tag')
                    details = [span.text.strip() for span in bot_tag.find_all('span', class_='cls-common')] if bot_tag else []
                    area = layout = orientation = ''
                    for detail in details:
                        if '㎡' in detail:
                            area = detail.replace('㎡', '').strip()
                        elif '室' in detail and '厅' in detail:
                            layout = detail
                        elif '朝' in detail:
                            orientation = detail
                    # Tags (join the details directly)
                    tags = '|'.join(details)
                    writer.writerow([title, price, area, layout, orientation, region, tags])
                except Exception as e:
                    print(f'第 {page} 页第 {index + 1} 个房源解析失败: {str(e)}')
            break  # request and parsing succeeded, leave the retry loop
        except Exception as e:
            print(f'第 {page} 页请求失败,尝试次数 {attempt + 1}/{max_retries}: {str(e)}')
            time.sleep(random.uniform(1, 3))  # wait a random interval before retrying
            continue


# Initialize the CSV file
with open('anjuke_rentals.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Price', 'Area', 'Layout', 'Orientation', 'Region', 'Tags'])
    # Fetch the data (example): scrape pages 1-10, inside the with-block so the file stays open
    for page in range(1, 11):
        get_anjuke_rental(page)
```
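The optimized code above still hard-codes the `Cookie` header. As a minimal sketch of what "obtain headers and cookies dynamically" could look like, the snippet below warms up a `requests.Session` so the server sets cookies on it; whether an anonymous visit to km.anjuke.com yields usable cookies is an assumption, and a real project may still need a login flow or a headless browser. The helper name `build_session` is illustrative, not part of the original answer.

```python
import requests


def build_session():
    """Create a session whose cookies come from the server rather than a hard-coded header.

    Sketch only: assumes an anonymous warm-up request is enough for the
    server to issue usable cookies; a login flow may be required in practice.
    """
    session = requests.Session()
    session.headers.update({
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/119.0.0.0 Safari/537.36'),
        'Referer': 'https://km.anjuke.com/',
    })
    # Visiting the home page lets the server set cookies on the session,
    # so later session.get() calls send them automatically.
    session.get('https://km.anjuke.com/', timeout=15)
    return session


# Usage sketch: pass the session into the scraper and call session.get(url)
# instead of requests.get(url, headers=headers).
```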
Explanation

1. Strengthened anti-scraping detection: an extra check for a specific anti-scraping element was added so the crawler can recognize when it has been blocked.
2. Retry mechanism for failed requests: a loop with a delay handles request failures, retrying up to three times.
3. Improved exception handling: the reason each listing failed to parse is logged, and processing continues with the next listing.
4. Dynamically obtained headers and cookies: the example keeps them simplified; a real project can obtain them dynamically by simulating a login or by other means.
5. CSV file initialization: the CSV file is initialized before `writer` is used, so the data can be written correctly (a variation that passes the writer in explicitly is sketched below).
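As a further hardening of point 5, and not part of the original answer, the `writer` can be passed into `get_anjuke_rental` as a parameter instead of being looked up as a module-level name, with the page loop kept inside the `with` block so the file is guaranteed to be open while rows are written. The function body below is a stub; the request and parsing logic stays as in the optimized code.

```python
import csv


def get_anjuke_rental(page, writer):
    # Same request/parsing logic as in the optimized code above;
    # the only change is that the CSV writer is an explicit parameter.
    ...


with open('anjuke_rentals.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Title', 'Price', 'Area', 'Layout', 'Orientation', 'Region', 'Tags'])
    for page in range(1, 11):  # keep the loop inside the with-block so the file stays open
        get_anjuke_rental(page, writer)
```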