Python WeChat (wx) Official Account anti-scraping discussion
Posted: 2020-11-7
Publisher: 葵宇科技
I'm scraping WeChat Official Account articles and looking for help getting past the next anti-scraping hurdle.
from selenium import webdriver
import time, json, re, random, requests

driver = webdriver.Chrome()
driver.get('https://mp.weixin.qq.com/')
time.sleep(1)
driver.find_element_by_link_text("使用帐号登录").click()  # "log in with an account"
time.sleep(1)
driver.find_element_by_name("account").clear()
driver.find_element_by_name("account").send_keys()  # enter your email here
time.sleep(2)
driver.find_element_by_name("password").clear()
driver.find_element_by_name("password").send_keys()  # enter your password here
driver.find_element_by_class_name("icon_checkbox").click()
time.sleep(2)
driver.find_element_by_class_name("btn_login").click()
time.sleep(15)  # leave time to scan the QR code / pass verification
cookies = driver.get_cookies()
print(cookies)
cookie = {}
for item in cookies:
    cookie[item.get('name')] = item.get('value')
with open('cookies.txt', 'w') as file:
    file.write(json.dumps(cookie))
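A quick way to check that the saved cookies are still valid before running the scraper (a minimal sketch; it relies on the same behaviour find_token below depends on, namely that a logged-in request gets redirected to a URL carrying token=...):

import json, requests

with open('cookies.txt', 'r') as file:
    cookies = json.loads(file.read())
response = requests.get('https://mp.weixin.qq.com', cookies=cookies)
# Expired cookies land back on the login page, whose URL has no token.
print('still logged in:', 'token=' in response.url)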
Divider: everything above just logs in and saves the cookies; everything below uses them.
import time, json, re, random, requests
from bs4 import BeautifulSoup

def find_cookies():
    with open("cookies.txt", "r") as file:
        cookie = file.read()
    cookies = json.loads(cookie)
    return cookies
def find_token(cookies):
    url = "https://mp.weixin.qq.com"
    response = requests.get(url, cookies=cookies)
    # A logged-in request gets redirected to a URL carrying token=...
    token = re.findall(r'token=(\d+)', str(response.url))[0]
    return token
def find_account(token, cookies):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
        "Host": "mp.weixin.qq.com"
    }
    requests_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz'
    authors_list = ['占豪']  # add the nicknames of the accounts you want here
    authorsnumberlist = []
    for author in authors_list:
        paras_author = {
            'action': 'search_biz',
            'begin': '0',
            'count': '5',
            'query': author,
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        res_choose = requests.get(requests_url, params=paras_author, cookies=cookies, headers=headers)
        json_choose = res_choose.json()
        names = json_choose['list']
        for name in names:
            author_name = name['nickname']
            if author_name == author:
                fakeid_number = name['fakeid']
        authorsnumberlist.append(fakeid_number)
        time.sleep(20)  # don't hammer the search endpoint
        print(author)
    return authorsnumberlist
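A note on the flow: searchbiz resolves a nickname to the account's fakeid, which is the identifier the article-list request below expects, and the 20-second sleep between queries is presumably what keeps the search endpoint's frequency limit at bay.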
def get_time(time_sj):
    # Turn a "YYYY-MM-DD" date string into a (start, end) Unix-timestamp
    # pair covering that whole day.
    day_start = int(time.mktime(time.strptime(time_sj, "%Y-%m-%d")))
    return (day_start, day_start + 86400)
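For example, on a UTC+8 machine:

start, end = get_time('2020-11-5')
print(start, end)  # 1604505600 1604592000, i.e. 2020-11-05 00:00 to 2020-11-06 00:00
# find_article below keeps articles whose create_time falls strictly inside this window.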
def find_article(fakeid_list, token, cookies, acquire_time):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
        "Host": "mp.weixin.qq.com"
    }
    links = []
    for each_number in fakeid_list:
        params = {
            'action': 'list_ex',
            'begin': '0',
            'count': '5',
            'fakeid': str(each_number),
            'type': '9',
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        account_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
        res_account = requests.get(account_url, params=params, cookies=cookies, headers=headers)
        json_account = res_account.json()
        papers = json_account['app_msg_list']
        for each_paper in papers:
            create_time = each_paper['create_time']  # renamed so it doesn't shadow the time module
            if acquire_time[0] < create_time < acquire_time[1]:
                links.append(each_paper['link'])
    return links
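One caveat: with begin=0 and count=5 this only ever sees an account's five most recent posts, so a busy account can push the target day's articles off the first page. A sketch of paging through older posts by stepping begin (find_article_paged is a name introduced here for illustration; it reuses the endpoint and parameters above, and the stop condition assumes app_msg_list comes back newest-first, which is how the backend appears to order it):

def find_article_paged(fakeid_list, token, cookies, acquire_time, page_size=5):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
        "Host": "mp.weixin.qq.com"
    }
    account_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
    links = []
    for fakeid in fakeid_list:
        begin = 0
        while True:
            params = {
                'action': 'list_ex', 'begin': str(begin), 'count': str(page_size),
                'fakeid': str(fakeid), 'type': '9', 'query': '',
                'token': token, 'lang': 'zh_CN', 'f': 'json', 'ajax': '1'
            }
            res = requests.get(account_url, params=params, cookies=cookies, headers=headers)
            papers = res.json().get('app_msg_list', [])
            if not papers:
                break  # no more pages (or the frequency limit kicked in)
            for paper in papers:
                if acquire_time[0] < paper['create_time'] < acquire_time[1]:
                    links.append(paper['link'])
            # Posts come back newest-first, so once the oldest entry on a
            # page predates the window we can stop paging this account.
            if papers[-1]['create_time'] < acquire_time[0]:
                break
            begin += page_size
            time.sleep(random.uniform(5, 10))  # throttle between pages
    return links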
def findandstore_txt(links):
    with open('爬文章.txt', 'a', encoding='utf-8') as file:
        for link in links:
            res = requests.get(link)
            soup = BeautifulSoup(res.text, 'html.parser')
            articlene = soup.find('div', id='img-content')
            if articlene is None:  # deleted or blocked article pages have no body
                continue
            content = articlene.text
            file.write(str(content))
            file.write('\n')
            file.write('-------------------------------------------------')
cookies = find_cookies()
token = find_token(cookies)
authorsnumberlist = find_account(token, cookies)
# your_time = input('Enter the date to scrape, e.g. 2020-08-11: ')
your_time = '2020-11-5'
acquire_time = get_time(your_time)
links = find_article(authorsnumberlist, token, cookies, acquire_time)
print(len(links))
try:
    findandstore_txt(links)
except UnicodeEncodeError:
    pass
This scrapes the articles and saves them as a .txt file.
Once more accounts go into the list, scraping that much gets the account banned. Could some expert help flesh out the anti-anti-scraping side, or DM me a solution? Thanks.
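Not a full answer to the ban problem, but the usual first step is to make the request pattern less regular: jitter the delays instead of sleeping fixed intervals, and cap how much one login session does. A minimal sketch under that assumption (polite_sleep and BATCH_SIZE are hypothetical names, and the numbers are guesses to tune, not known limits):

import random, time

def polite_sleep(base=20, jitter=40):
    # Sleep a random 20-60 s; fixed intervals are easier for the server
    # to fingerprint than jittered ones.
    time.sleep(base + random.uniform(0, jitter))

BATCH_SIZE = 5  # hypothetical cap on accounts per login session

# Call polite_sleep() between every searchbiz/appmsg request, split
# authors_list into BATCH_SIZE chunks, and regenerate cookies.txt
# (re-run the Selenium login) between chunks.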
Reference
https://blog.csdn.net/weixin_41267342/article/details/96729138
Written by following the article by 村西那条弯弯的河流.
If this infringes, please DM me.