2024-09-23
Crawler

Table of Contents

INS crawler
Cookie example
Proxy
Original code
Whether a cookie is needed

Original Git link

INS crawler

Recently, quite a few readers have emailed or messaged me about problems with the INS crawler, so I took another look and pushed an updated version.

Also, because of TLS fingerprinting, I switched the HTTP library to tls_client, which imitates a browser when making requests.
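
A minimal sketch of what that swap looks like (the Session arguments are the same ones used in the full code below):

```python
from tls_client import Session

# impersonate Chrome 104's TLS handshake instead of a plain Python client,
# so Instagram's TLS fingerprinting sees a regular browser
session = Session(client_identifier="chrome_104", random_tls_extension_order=True)
resp = session.get('https://www.instagram.com/')  # same call shape as requests.get
print(resp.status_code)
```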

The main cause of the problem was the request-header extraction: the current headers no longer look like the old ones, so requests come back with a response like this:

```json
{'message': 'useragent mismatch', 'status': 'fail'}
```

After correcting the request headers, we get the expected result:

```json
{
  "biography": "",
  "username": "renebaebae",
  "fbid": "17841419632210822",
  "full_name": "IRENE",
  "id": "19446582407",
  "followed_by": 11689096,
  "follow": 4,
  "avatar": "https://scontent-nrt1-1.cdninstagram.com/v/t51.2885-19/69067759_957976681207922_446652332218777600_n.jpg?stp=dst-jpg_s320x320&_nc_ht=scontent-nrt1-1.cdninstagram.com&_nc_cat=1&_nc_ohc=lqyt1uLD7bkQ7kNvgElH_Vc&_nc_gid=da9e9a13bbbb40afb7ca800ef16e8ded&edm=AOQ1c0wBAAAA&ccb=7-5&oh=00_AYDS_6vHqjNGgGPP1uGP2VCpuzkcf-UR4zaatJ_IPdTFJg&oe=66F7168F&_nc_sid=8b3546",
  "noteCount": 165,
  "is_private": false,
  "is_verified": true
}
```

Cookie example

Some readers also weren't sure about the cookie format, so here it is:

```json
{
  "wd": "",
  "dpr": "",
  "mid": "",
  "datr": "",
  "ig_did": "",
  "ig_nrcb": "",
  "ps_l": "1",
  "ps_n": "1",
  "csrftoken": "",
  "ds_user_id": "",
  "sessionid": "",
  "shbid": "",
  "shbts": "",
  "rur": ""
}
```

Roughly these fields are needed. The simplest route is to copy the cookie string from the page and convert it into a Python dict, either with an online converter or with the short sketch below.
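
If you'd rather skip the online converter, a few lines of Python do the same job (the helper name cookie_str_to_dict is mine, purely for illustration):

```python
# turn the raw "Cookie" request-header string copied from DevTools
# into the dict this script expects (hypothetical helper, not part of the repo)
def cookie_str_to_dict(cookie_str: str) -> dict:
    return dict(
        pair.strip().split('=', 1)  # split each pair on the first '=' only
        for pair in cookie_str.split(';')
        if '=' in pair
    )

print(cookie_str_to_dict('mid=abc; csrftoken=def; sessionid=ghi'))
# {'mid': 'abc', 'csrftoken': 'def', 'sessionid': 'ghi'}
```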

Proxy

Some readers also asked how to add a proxy, so I've hoisted the proxy settings into a module-level variable:

```python
proxy = {
    "http": "http://127.0.0.1:17890",
    "https": "http://127.0.0.1:17890",
}
```

Just edit the addresses to match your own proxy, keeping the Python dict format.
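
For context, this is all the class does with that dict (mirroring the constructor in the full code below):

```python
from tls_client import Session

proxy = {
    "http": "http://127.0.0.1:17890",
    "https": "http://127.0.0.1:17890",
}

session = Session(client_identifier="chrome_104", random_tls_extension_order=True)
session.proxies.update(proxy)  # every subsequent session.get() goes through the proxy
```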

Original code

```python
import time

from tls_client import Session

# endpoints that need a login cookie:
#   user info - not necessarily
#   post      - needed
#   comment   - needed
# cookie example:
# {
#     "wd": "",
#     "dpr": "",
#     "mid": "",
#     "datr": "",
#     "ig_did": "",
#     "ig_nrcb": "",
#     "ps_l": "1",
#     "ps_n": "1",
#     "csrftoken": "",
#     "ds_user_id": "",
#     "sessionid": "",
#     "shbid": "",
#     "shbts": "",
#     "rur": ""
# }
cookie = {
    # your cookie
}

PARAMS = r'("app_id":\s*"[^"]+")|("claim":\s*"[^"]+")|("csrf_token":\s*"[^"]+")|(["LSD",[],{"token":\s*"[^"]+")'
URLS = [
    'https://i.instagram.com/',
    'https://i.instagram.com/api/v1/users/web_profile_info/',
    'https://i.instagram.com/api/v1/feed/user',
    'https://i.instagram.com/api/v1/media/'
]

# proxy, optional
proxy = {
    "http": "http://127.0.0.1:17890",
    "https": "http://127.0.0.1:17890",
}


class Ins:
    def __init__(self, cookies: dict):
        self.cookies = cookies
        self.session = Session(client_identifier="chrome_104", random_tls_extension_order=True)
        self.session.proxies.update(proxy)
        self.headers = {
            'sec-fetch-mode': 'cors',
            'referer': 'https://www.instagram.com/',
            'x-ig-app-id': '936619743392459',
            'sec-fetch-site': 'same-site',
            'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
            'x-asbd-id': '198387',
            'accept': '*/*',
            'sec-ch-ua': '"Chromium";v="104", " Not A;Brand";v="99", "Google Chrome";v="104"',
            'sec-ch-ua-mobile': '?0',
            'x-ig-www-claim': 'hmac.AR11qy__GsvLpiS4wKBygLGdxs2DxJB1esTkBw7b2QFaHH2d',
            'authority': 'i.instagram.com',
            'sec-ch-ua-platform': '"Windows"',
            'x-instagram-ajax': '1006400593',
            'sec-fetch-dest': 'empty',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.102 Safari/537.36'
        }

    def ajax_request(self, url: str, /, params=None):
        """
        do the request; the engine of the class
        :param url: api url
        :param params: api params
        :return: json object, or None after five failed attempts
        """
        for _ in range(5):
            try:
                resp = self.session.get(url, headers=self.headers, params=params, cookies=self.cookies)
                return resp.json()
            except Exception:
                # back off, then retry on network or decoding errors
                time.sleep(15)
        return None

    def get_userInfo(self, userName: str):
        """
        get user info by username
        :param userName: name of user
        :return: dict of user info
        """
        params = {
            'username': userName,
        }
        resp = self.ajax_request(URLS[1], params=params)
        if resp:
            try:
                # a network hiccup may return a payload without this key
                data = resp['data']['user']
            except KeyError:
                raise KeyError('Could not get user information...')
            return {
                'biography': data.get('biography'),
                'username': data.get('username'),
                'fbid': data.get('fbid'),
                'full_name': data.get('full_name'),
                'id': data.get('id'),
                'followed_by': data.get('edge_followed_by', {}).get('count'),
                'follow': data.get('edge_follow', {}).get('count'),
                'avatar': data.get('profile_pic_url_hd'),
                'noteCount': data.get('edge_owner_to_timeline_media', {}).get('count'),
                'is_private': data.get('is_private'),
                'is_verified': data.get('is_verified')
            } if data else 'unknown User'

    def get_userPosts(self, userName: str):
        """
        get all posts from the username
        :param userName: name
        :return: generator
        """
        continuations = [{
            'count': '12',
        }]
        temp = userName + '/username/'
        while continuations:
            continuation = continuations.pop()
            # the url changes from the second request onwards
            url = URLS[2] + f'/{temp}'
            resp = self.ajax_request(url, params=continuation)
            # no such user
            if not resp.get('user'):
                yield 'checking cookie or unknown/private User: {}'.format(userName)
            else:
                _items = resp.get('items')
                # simulate scrolling down for the next page
                if resp.get('more_available'):
                    continuations.append({'count': '12', 'max_id': resp.get('next_max_id')})
                user = resp.get('user')
                temp = user.get('pk_id') if user.get('pk_id') else user.get('pk')
                yield from self.extract_post(_items)

    def get_comments(self, id):
        """
        get comments of the given post id
        :param id: post id
        :return: generator of comments
        """
        continuations = [{
            'can_support_threading': 'true',
            'permalink_enabled': 'false',
        }]
        # base url
        url = URLS[3] + f'{id}/comments/'
        while continuations:
            continuation = continuations.pop()
            resp = self.ajax_request(url, params=continuation)
            if resp.get('next_min_id'):
                continuations.append({
                    'can_support_threading': 'true',
                    'min_id': resp.get('next_min_id')
                })
            comments = resp.get('comments')
            if comments:
                for comment in comments:
                    yield {
                        'id': comment.get('pk'),
                        'user_name': comment.get('user', {}).get('username'),
                        'user_fullname': comment.get('user', {}).get('full_name'),
                        'text': comment.get('text'),
                        'created_at': comment.get('created_at'),
                        'comment_like_count': comment.get('comment_like_count'),
                        'reply_count': comment.get('child_comment_count')
                    }
                    if comment.get('child_comment_count') > 0:
                        yield from self.get_child_comment(id, comment.get('pk'))
            else:
                yield 'no comments or losing login cookies'

    def get_child_comment(self, main_id, id):
        """
        get children of a comment by comment_id, only used in get_comments()
        :param main_id: post id
        :param id: comment_id
        :return: generator of child comments
        """
        url = f'https://www.instagram.com/api/v1/media/{main_id}/comments/{id}/child_comments/'
        continuations = [{'max_id': ''}]
        while continuations:
            continuation = continuations.pop()
            resp = self.ajax_request(url, params=continuation)
            cursor = resp.get('next_max_child_cursor')
            if cursor:
                continuations.append({'max_id': cursor})
            comments = resp.get('child_comments')
            if comments:
                for comment in comments:
                    yield {
                        'id': comment.get('pk'),
                        'user_name': comment.get('user', {}).get('username'),
                        'user_fullname': comment.get('user', {}).get('full_name'),
                        'text': comment.get('text'),
                        'created_at': comment.get('created_at'),
                        'comment_like_count': comment.get('comment_like_count'),
                    }

    @staticmethod
    def extract_post(posts):
        """
        extract the useful fields from a list of posts
        :param posts: original instagram posts
        :return: generator of post dicts
        """
        for post in posts:
            caption = post.get('caption')
            item = {
                'code': post.get('code'),
                'id': post.get('pk'),
                'pk_id': post.get('id'),
                'comment_count': post.get('comment_count'),
                'like_count': post.get('like_count'),
                'text': caption.get('text') if caption else None,
                'created_at': caption.get('created_at') if caption else post.get('taken_at'),
            }
            # other media types can be added as needed
            types = post.get('media_type')
            if types == 8:  # carousel
                item['photo'] = [
                    _.get('image_versions2', {}).get('candidates', [{}])[0].get('url')
                    for _ in post.get('carousel_media')
                ]
            elif types == 2:  # video
                item['video'] = post.get('video_versions', [{}])[0].get('url')
            elif types == 1:  # single photo
                item['photo'] = post.get('image_versions2', {}).get('candidates', [{}])[0].get('url')
            yield item


if __name__ == '__main__':
    INS = Ins(cookie)
    # items = INS.get_userPosts('renebaebae')
    items = INS.get_comments('3092771276598639274')
    for item in items:
        print(item)
        break
    item = INS.get_userInfo('renebaebae')
    print(item)
```

Whether a cookie is needed

| Type | Cookie needed? |
| --- | --- |
| user_info | not necessarily |
| post | yes |
| comment | yes |
If this was useful to you, feel free to leave a tip.

Author: 回锅炒辣椒

Link:

Copyright notice: Unless otherwise stated, all articles on this blog are licensed under the BY-NC-SA license. Please credit the source when reposting!