python - 使用 python 请求会话登录 LinkedIn

Question

我正在尝试使用 Python 请求登录 LinkedIn：

import sys
import requests
from BeautifulSoup import BeautifulSoup


payload={
    'session-key' : 'user@email.com',
    'session-password' : 'password'
}

URL='https://www.linkedin.com/uas/login-submit'
s=requests.session()
s.post(URL,data=payload)

r=s.get('http://www.linkedin.com/nhome')
soup = BeautifulSoup(r.text)
print soup.find('title')

我似乎无法使用此方法登录。我什至尝试在有效负载中使用 csrf 等，但会话不应该为你处理吗？

注意最后一行：我使用标题来检查我是否已成功登录。（如果我已登录，我应该看到“欢迎！| LinkedIn”，而不是看到“世界上最大的专业网络 | LinkedIn”

我错过了什么吗？

score 20 · Accepted Answer

我修改了一个网络抓取模板，用于满足我的大多数基于 Python 的抓取需求，以满足您的需求。验证它与我自己的登录信息一起使用。

它的工作方式是模仿浏览器并维护一个 cookieJar 来存储您的用户会话。让它与 BeautifulSoup 一起为你工作。

注意：这是 Python2 版本。我应要求在下面进一步添加了一个工作 Python3 示例。

import cookielib
import os
import urllib
import urllib2
import re
import string
from BeautifulSoup import BeautifulSoup

username = "user@email.com"
password = "password"

cookie_filename = "parser.cookies.txt"

class LinkedInParser(object):

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib2.build_opener(
            urllib2.HTTPRedirectHandler(),
            urllib2.HTTPHandler(debuglevel=0),
            urllib2.HTTPSHandler(debuglevel=0),
            urllib2.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                           'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print title

        self.cj.save()


    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            return ''.join(response.readlines())
        except:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            return self.loadPage(url, data)

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        html = self.loadPage("https://www.linkedin.com/")
        soup = BeautifulSoup(html)
        csrf = soup.find(id="loginCsrfParam-login")['value']

        login_data = urllib.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        })

        html = self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        html = self.loadPage("https://www.linkedin.com/feed/")
        soup = BeautifulSoup(html)
        return soup.find("title")

parser = LinkedInParser(username, password)

2014 年 6 月 19 日更新：添加了从主页解析 CSRF 令牌以用于更新的登录过程。

2015 年 7 月 23 日更新：在此处添加 Python 3 示例。基本上需要替换库位置并删除不推荐使用的方法。它没有完美的格式或任何东西，但它可以工作。很抱歉赶工。最后，原则和步骤是相同的。

import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup

username = "user@email.com"
password = "password"

cookie_filename = "parser.cookies.txt"

class LinkedInParser(object):

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; '
                           'Windows NT 5.2; .NET CLR 1.1.4322)'))
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        self.cj.save()


    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            return ''.join([str(l) for l in response.readlines()])
        except Exception as e:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            return self.loadPage(url, data)

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        soup = self.loadSoup("https://www.linkedin.com/")
        csrf = soup.find(id="loginCsrfParam-login")['value']
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': csrf,
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/uas/login-submit", login_data)
        return

    def loadTitle(self):
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")

parser = LinkedInParser(username, password)

score 13 · Accepted Answer

这是一个更简单的版本。

import requests
from bs4 import BeautifulSoup

client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'

html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find(id="loginCsrfParam-login")['value']

login_information = {
    'session_key':'Login',
    'session_password':'Password',
    'loginCsrfParam': csrf,
}

client.post(LOGIN_URL, data=login_information)

client.get('Any_Linkedin_URL')

score 3 · Accepted Answer

2019 版。

稍微修改版本的工作，考虑到页面的新结构来查找连接 cookie 并添加 trk 参数。

import requests
from bs4 import BeautifulSoup

email = ""
password = ""

client = requests.Session()

HOMEPAGE_URL = 'https://www.linkedin.com'
LOGIN_URL = 'https://www.linkedin.com/uas/login-submit'

html = client.get(HOMEPAGE_URL).content
soup = BeautifulSoup(html, "html.parser")
csrf = soup.find('input', {'name': 'loginCsrfParam'}).get('value')

login_information = {
    'session_key': email,
    'session_password': password,
    'loginCsrfParam': csrf,
    'trk': 'guest_homepage-basic_sign-in-submit'
}

client.post(LOGIN_URL, data=login_information)

response = client.get('')

score 2 · Accepted Answer

@garromark 接受的解决方案的 2020 版本：

import http.cookiejar as cookielib
import os
import urllib
import re
import string
from bs4 import BeautifulSoup

username = ""
password = ""

cookie_filename = "parser.cookies.txt"


class LinkedInParser(object):

    def __init__(self, login, password):
        """ Start up... """
        self.login = login
        self.password = password

        # Simulate browser with cookies enabled
        self.cj = cookielib.MozillaCookieJar(cookie_filename)
        if os.access(cookie_filename, os.F_OK):
            self.cj.load()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPRedirectHandler(),
            urllib.request.HTTPHandler(debuglevel=0),
            urllib.request.HTTPSHandler(debuglevel=0),
            urllib.request.HTTPCookieProcessor(self.cj)
        )
        self.opener.addheaders = [
            ('User-agent', 'Mozilla/5.0')
        ]

        # Login
        self.loginPage()

        title = self.loadTitle()
        print(title)

        # self.cj.save()

    def loadPage(self, url, data=None):
        """
        Utility function to load HTML from URLs for us with hack to continue despite 404
        """
        # We'll print the url in case of infinite loop
        # print "Loading URL: %s" % url
        try:
            if data is not None:
                response = self.opener.open(url, data)
            else:
                response = self.opener.open(url)
            content = ''.join([str(l) for l in response.readlines()])
            print("Page loaded: %s \n Content: %s \n" % (url, content))
            return content
        except Exception as e:
            # If URL doesn't load for ANY reason, try again...
            # Quick and dirty solution for 404 returns because of network problems
            # However, this could infinite loop if there's an actual problem
            print("Exception on %s load: %s" % (url, e))
            # return self.loadPage(url, data)

    def loadSoup(self, url, data=None):
        """
        Combine loading of URL, HTML, and parsing with BeautifulSoup
        """
        html = self.loadPage(url, data)
        soup = BeautifulSoup(html, "html5lib")
        return soup

    def loginPage(self):
        """
        Handle login. This should populate our cookie jar.
        """
        soup = self.loadSoup("https://www.linkedin.com/login")
        loginCsrfParam = soup.find("input", {"name": "loginCsrfParam"})['value']
        csrfToken = soup.find("input", {"name": "csrfToken"})['value']
        sIdString = soup.find("input", {"name": "sIdString"})['value']
        print("loginCsrfParam: %s" % loginCsrfParam)
        print("csrfToken: %s" % csrfToken)
        print("sIdString: %s" % sIdString)
        login_data = urllib.parse.urlencode({
            'session_key': self.login,
            'session_password': self.password,
            'loginCsrfParam': loginCsrfParam,
            'csrfToken': csrfToken,
            'sIdString': sIdString
        }).encode('utf8')

        self.loadPage("https://www.linkedin.com/checkpoint/lg/login-submit", login_data)

    def loadTitle(self):
        soup = self.loadSoup("https://www.linkedin.com/feed/")
        return soup.find("title")


parser = LinkedInParser(username, password)

score 1 · Accepted Answer

OP 的解决方案对我有用，只需稍作修改。

将“会话密钥”更改为“会话密钥”并将“会话密码”更改为会话密码。

除此之外，代码本身就很好。

python - 使用 python 请求会话登录 LinkedIn

5 回答 5

Related

Reference