1

我需要下载多家公司的 10-K 文档。如果只下载 5 到 10 家公司的 10-K,此代码可以正常工作;但如果我增加 cik_lookup 字典中的公司数量,就会报错。代码如下。

import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper
from tqdm import tqdm

这是包含 project_helper 函数的 py 文件。

import matplotlib.pyplot as plt
import requests

from ratelimit import limits, sleep_and_retry


class SecAPI(object):
    """Rate-limited HTTP client for SEC EDGAR.

    The SEC asks clients to stay at or under 10 requests per second; we
    allow only half of that to keep a safe margin, and we declare a
    User-Agent because EDGAR rejects/throttles anonymous clients by
    serving an error page instead of the requested document.
    """

    SEC_CALL_LIMIT = {'calls': 10, 'seconds': 1}

    @staticmethod
    @sleep_and_retry
    # Dividing the call limit by half to avoid coming close to the limit.
    # Use integer division: `limits` expects an int call count, not 5.0.
    @limits(calls=SEC_CALL_LIMIT['calls'] // 2, period=SEC_CALL_LIMIT['seconds'])
    def _call_sec(url):
        # EDGAR requires a declared User-Agent identifying the requester;
        # without it, large batches of requests get an HTML error page,
        # which later fails to parse as the expected Atom feed.
        headers = {'User-Agent': 'research-script contact@example.com'}
        return requests.get(url, headers=headers)

    def get(self, url):
        """Return the response body of ``url`` as text (rate-limited).

        Raises:
            requests.HTTPError: if EDGAR answered with an HTTP error status,
                so a failure is reported here rather than as a confusing
                parse error downstream.
        """
        response = self._call_sec(url)
        response.raise_for_status()
        return response.text


def print_ten_k_data(ten_k_data, fields, field_length_limit=50):
    """Pretty-print selected fields from a list of 10-K filing dicts.

    Args:
        ten_k_data: list of dicts, one per filing.
        fields: iterable of keys to display from each dict.
        field_length_limit: maximum printed length of a value before it is
            truncated with a trailing ellipsis.
    """
    indentation = '  '

    print('[')
    for ten_k in ten_k_data:
        print_statement = '{}{{'.format(indentation)
        for field in fields:
            # Every value is coerced to str, so the original
            # `isinstance(value, str)` check was always true (dead branch);
            # all values are quoted, with newlines shown escaped as '\n'.
            value = str(ten_k[field])
            value_str = '\'{}\''.format(value.replace('\n', '\\n'))

            # Cut off the string if it gets too long
            if len(value_str) > field_length_limit:
                value_str = value_str[:field_length_limit] + '...'

            print_statement += '\n{}{}: {}'.format(indentation * 2, field, value_str)

        print_statement += '},'
        print(print_statement)
    print(']')

第一步是下载 NLP Corpora。

# Fetch the NLP corpora used later for stop-word removal and lemmatization.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

接下来获取 10-K 文档。

#cik_lookup = {
#    'GOOGL':'0001288776',
#    'AAPL':'0000320193',
#    'FACEBOOK':'0001326801',
#    'AMZN':'0001018724',
#    'MSFT':'0000789019'}

# Ticker -> CIK (Central Index Key): the zero-padded identifier that SEC
# EDGAR uses to look up a company's filings.
cik_lookup = dict(
    AEP='0000004904',
    AXP='0000004962',
    BA='0000012927',
    BK='0001390777',
    CAT='0000018230',
    DE='0000315189',
    DIS='0001001039',
    DTE='0000936340',
    ED='0001047862',
    EMR='0000032604',
    ETN='0001551182',
    GE='0000040545',
    IBM='0000051143',
    IP='0000051434',
    JNJ='0000200406',
    KO='0000021344',
    LLY='0000059478',
    MCD='0000063908',
    MO='0000764180',
    MRK='0000310158',
    MRO='0000101778',
    PCG='0001004980',
    PEP='0000077476',
    PFE='0000078003',
    PG='0000080424',
    PNR='0000077360',
    SYY='0000096021',
    TXN='0000097476',
    UTX='0000101829',
    WFC='0000072971',
    WMT='0000104169',
    WY='0000106535',
    XOM='0000034088',
)

获取 10-ks 列表

sec_api = project_helper.SecAPI()


from bs4 import BeautifulSoup
    def get_sec_data(cik, doc_type, start=0, count=60):
        newest_pricing_data = pd.to_datetime('2021-01-01')
        rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' \
            '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom' \
            .format(cik, doc_type, start, count)
        sec_data = sec_api.get(rss_url)
        feed = BeautifulSoup(sec_data.encode('utf-8'), 'xml').feed
        entries = [
            (
                entry.content.find('filing-href').getText(),
                entry.content.find('filing-type').getText(),
                entry.content.find('filing-date').getText())
            for entry in feed.find_all('entry', recursive=False)
            if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
    
        return entries
    
    example_ticker = 'AEP'
    sec_data = {}
    
    for ticker, cik in cik_lookup.items():
        sec_data[ticker] = get_sec_data(cik, '10-K')

如果我下载 5-10 家公司之间的 10-ks,代码可以正常工作。但是,如果我在 [cik_lookup 函数] 中增加公司的数量,我会收到以下错误。我得到的第一个错误如下。

UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-8-28a784054794> in <module>()
     20 
     21 for ticker, cik in cik_lookup.items():
---> 22     sec_data[ticker] = get_sec_data(cik, '10-K')

<ipython-input-8-28a784054794> in get_sec_data(cik, doc_type, start, count)
      5     rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany'         '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom'         .format(cik, doc_type, start, count)
      6     sec_data = sec_api.get(rss_url)
----> 7     feed = BeautifulSoup(sec_data.encode('ascii'), 'xml').feed
      8     entries = [
      9         (

UnicodeEncodeError: 'ascii' codec can't encode characters in position 2599-2601: ordinal not in range(128)

但是,在谷歌搜索了一些关于 BeautifulSoup 编码的资料后,我将编码改为 utf-8,随后出现以下错误。

  ---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-9c77ed07af2d> in <module>()
     20 
     21 for ticker, cik in cik_lookup.items():
---> 22     sec_data[ticker] = get_sec_data(cik, '10-K')

<ipython-input-9-9c77ed07af2d> in get_sec_data(cik, doc_type, start, count)
     11             entry.content.find('filing-type').getText(),
     12             entry.content.find('filing-date').getText())
---> 13         for entry in feed.find_all('entry', recursive=False)
     14         if pd.to_datetime(entry.content.find('filing-date').getText()) <= newest_pricing_data]
     15 

AttributeError: 'NoneType' object has no attribute 'find_all'

可以在以下 GitHub 存储库中访问该项目,repo 链接见此处。

4

0 回答 0