Python WeChat (wx) Official Account anti-scraping discussion
Posted: 2020-11-7
Publisher: 葵宇科技
I'm scraping WeChat Official Account articles and looking for help getting past the next anti-scraping hurdle.
from selenium import webdriver
import time, json, re, random, requests

driver = webdriver.Chrome()
driver.get('https://mp.weixin.qq.com/')
time.sleep(1)
driver.find_element_by_link_text("使用帐号登录").click()  # "log in with an account"
time.sleep(1)
driver.find_element_by_name("account").clear()
driver.find_element_by_name("account").send_keys()  # enter your email here
time.sleep(2)
driver.find_element_by_name("password").clear()
driver.find_element_by_name("password").send_keys()  # enter your password here
driver.find_element_by_class_name("icon_checkbox").click()
time.sleep(2)
driver.find_element_by_class_name("btn_login").click()
time.sleep(15)  # leave time to scan the QR code / pass verification
cookies = driver.get_cookies()
print(cookies)
cookie = {}
for item in cookies:
    cookie[item.get('name')] = item.get('value')
with open('cookies.txt', 'w') as file:
    file.write(json.dumps(cookie))
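A quick way to check that the saved cookies are still valid before running the scraper (a minimal sketch; it relies on the same behaviour find_token below depends on, namely that a logged-in request gets redirected to a URL carrying token=...):

import json, requests

with open('cookies.txt', 'r') as file:
    cookies = json.loads(file.read())
response = requests.get('https://mp.weixin.qq.com', cookies=cookies)
# Expired cookies land back on the login page, whose URL has no token.
print('still logged in:', 'token=' in response.url)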
Divider: everything above just logs in and saves the cookies; everything below uses them.
import time, json, re, random, requests
from bs4 import BeautifulSoup

def find_cookies():
    with open("cookies.txt", "r") as file:
        cookie = file.read()
    cookies = json.loads(cookie)
    return cookies
def find_token(cookies):
    url = "https://mp.weixin.qq.com"
    response = requests.get(url, cookies=cookies)
    # A logged-in request gets redirected to a URL carrying token=...
    token = re.findall(r'token=(\d+)', str(response.url))[0]
    return token
def find_account(token, cookies):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
        "Host": "mp.weixin.qq.com"
    }
    requests_url = 'https://mp.weixin.qq.com/cgi-bin/searchbiz'
    authors_list = ['占豪']  # add the nicknames of the accounts you want here
    authorsnumberlist = []
    for author in authors_list:
        paras_author = {
            'action': 'search_biz',
            'begin': '0',
            'count': '5',
            'query': author,
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        res_choose = requests.get(requests_url, params=paras_author, cookies=cookies, headers=headers)
        json_choose = res_choose.json()
        names = json_choose['list']
        for name in names:
            author_name = name['nickname']
            if author_name == author:
                fakeid_number = name['fakeid']
        authorsnumberlist.append(fakeid_number)
        time.sleep(20)  # don't hammer the search endpoint
        print(author)
    return authorsnumberlist
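A note on the flow: searchbiz resolves a nickname to the account's fakeid, which is the identifier the article-list request below expects, and the 20-second sleep between queries is presumably what keeps the search endpoint's frequency limit at bay.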
def get_time(time_sj):
    # Turn a "YYYY-MM-DD" date string into a (start, end) Unix-timestamp
    # pair covering that whole day.
    day_start = int(time.mktime(time.strptime(time_sj, "%Y-%m-%d")))
    return (day_start, day_start + 86400)
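For example, on a UTC+8 machine:

start, end = get_time('2020-11-5')
print(start, end)  # 1604505600 1604592000, i.e. 2020-11-05 00:00 to 2020-11-06 00:00
# find_article below keeps articles whose create_time falls strictly inside this window.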
def find_article(fakeid_list, token, cookies, acquire_time):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
        "Host": "mp.weixin.qq.com"
    }
    links = []
    for each_number in fakeid_list:
        params = {
            'action': 'list_ex',
            'begin': '0',
            'count': '5',
            'fakeid': str(each_number),
            'type': '9',
            'query': '',
            'token': token,
            'lang': 'zh_CN',
            'f': 'json',
            'ajax': '1'
        }
        account_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
        res_account = requests.get(account_url, params=params, cookies=cookies, headers=headers)
        json_account = res_account.json()
        papers = json_account['app_msg_list']
        for each_paper in papers:
            create_time = each_paper['create_time']  # renamed so it doesn't shadow the time module
            if acquire_time[0] < create_time < acquire_time[1]:
                links.append(each_paper['link'])
    return links
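One caveat: with begin=0 and count=5 this only ever sees an account's five most recent posts, so a busy account can push the target day's articles off the first page. A sketch of paging through older posts by stepping begin (find_article_paged is a name introduced here for illustration; it reuses the endpoint and parameters above, and the stop condition assumes app_msg_list comes back newest-first, which is how the backend appears to order it):

def find_article_paged(fakeid_list, token, cookies, acquire_time, page_size=5):
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.183 Safari/537.36',
        "Referer": "https://mp.weixin.qq.com/cgi-bin/appmsg?t=media/appmsg_edit_v2&action=edit&isNew=1&type=10&token=" + token + "&lang=zh_CN",
        "Host": "mp.weixin.qq.com"
    }
    account_url = 'https://mp.weixin.qq.com/cgi-bin/appmsg'
    links = []
    for fakeid in fakeid_list:
        begin = 0
        while True:
            params = {
                'action': 'list_ex', 'begin': str(begin), 'count': str(page_size),
                'fakeid': str(fakeid), 'type': '9', 'query': '',
                'token': token, 'lang': 'zh_CN', 'f': 'json', 'ajax': '1'
            }
            res = requests.get(account_url, params=params, cookies=cookies, headers=headers)
            papers = res.json().get('app_msg_list', [])
            if not papers:
                break  # no more pages (or the frequency limit kicked in)
            for paper in papers:
                if acquire_time[0] < paper['create_time'] < acquire_time[1]:
                    links.append(paper['link'])
            # Posts come back newest-first, so once the oldest entry on a
            # page predates the window we can stop paging this account.
            if papers[-1]['create_time'] < acquire_time[0]:
                break
            begin += page_size
            time.sleep(random.uniform(5, 10))  # throttle between pages
    return links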
def findandstore_txt(links):
    with open('爬文章.txt', 'a', encoding='utf-8') as file:
        for link in links:
            res = requests.get(link)
            soup = BeautifulSoup(res.text, 'html.parser')
            articlene = soup.find('div', id='img-content')
            if articlene is None:  # deleted or blocked article pages have no body
                continue
            content = articlene.text
            file.write(str(content))
            file.write('\n')
            file.write('-------------------------------------------------')
cookies = find_cookies()
token = find_token(cookies)
authorsnumberlist = find_account(token, cookies)
# your_time = input('Enter the date to scrape, e.g. 2020-08-11: ')
your_time = '2020-11-5'
acquire_time = get_time(your_time)
links = find_article(authorsnumberlist, token, cookies, acquire_time)
print(len(links))
try:
    findandstore_txt(links)
except UnicodeEncodeError:
    pass
This scrapes the articles and saves them as a .txt file.
Once more accounts go into the list, scraping that much gets the account banned. Could some expert help flesh out the anti-anti-scraping side, or DM me a solution? Thanks.
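Not a full answer to the ban problem, but the usual first step is to make the request pattern less regular: jitter the delays instead of sleeping fixed intervals, and cap how much one login session does. A minimal sketch under that assumption (polite_sleep and BATCH_SIZE are hypothetical names, and the numbers are guesses to tune, not known limits):

import random, time

def polite_sleep(base=20, jitter=40):
    # Sleep a random 20-60 s; fixed intervals are easier for the server
    # to fingerprint than jittered ones.
    time.sleep(base + random.uniform(0, jitter))

BATCH_SIZE = 5  # hypothetical cap on accounts per login session

# Call polite_sleep() between every searchbiz/appmsg request, split
# authors_list into BATCH_SIZE chunks, and regenerate cookies.txt
# (re-run the Selenium login) between chunks.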
Reference
https://blog.csdn.net/weixin_41267342/article/details/96729138
Written by following the article by 村西那条弯弯的河流.
If this infringes, please DM me.