
Okay, I'm new to Python. I'm trying to tokenize and stem tweets to build a model, and then use gridsearch to find the best hyperparameters. I'm open to any kind of feedback.

Here is my code:

import nltk
nltk.download("stopwords")

from nltk.corpus import stopwords

spanish_stopwords = stopwords.words('spanish')

from string import punctuation
non_words = list(punctuation)

#we add spanish punctuation
non_words.extend(['¿', '¡'])
non_words.extend(map(str,range(10)))

from sklearn.feature_extraction.text import CountVectorizer       
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
stemmer = SnowballStemmer('spanish')
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    # remove non letters
    text = ''.join([c for c in text if c not in non_words])
    # tokenize
    tokens =  word_tokenize(text)
    # stem
    try:
        stems = stem_tokens(tokens, stemmer)

    except Exception as e:
        print(e)
        print(text)
        stems = ['']
    return stems    

from sklearn.cross_validation import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline

tweets_corpus = tweets_corpus[tweets_corpus.polarity != 'NEU']

tweets_corpus['polarity_bin'] = 0
tweets_corpus.polarity_bin[tweets_corpus.polarity.isin(['P', 'P+'])] = 1
print(tweets_corpus.polarity_bin.value_counts(normalize=True))
if __name__ == '__main__':
    import tokenize
    vectorizer = CountVectorizer(
            analyzer = 'word',
            tokenizer = tokenize,
            lowercase = True,
            stop_words = spanish_stopwords)

    pipeline = Pipeline([
         ('vect', vectorizer),
         ('cls', LinearSVC()),
    ])

    parameters = {
        'vect__max_df': (0.5, 1.9),
        'vect__min_df': (10, 20,50),
        'vect__max_features': (500, 1000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        'cls__C': (0.2, 0.5, 0.7),
        'cls__loss': ('hinge', 'squared_hinge'),
        'cls__max_iter': (500, 1000)
     }

    from time import time
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,scoring='roc_auc')
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
    print(grid_search.best_params_)
    t0 = time()
    print("done in %0.3fs" % (time() - t0))

Here is a sample of the data I'm trying to fit:

Name: polarity_bin, dtype: float64
      agreement  \
270         NaN   
208         NaN   
902         NaN   
31056       NaN   
1158        NaN   

                                                                                                                                          content  \
270                     @revolucion2017 @Pablo_Iglesias_  Cultura  es     reflexionar sobre algo q ha dicho alguien y si te gusta hacerlo tuyo.pq no?   
208    @_UnaOpinionMas_ @PPopular En eso estoi de acuerdo por lo menos al PP     se le ve que hace cosas y contara d nuevo cn mi  voto  como siempre.   
902                                                            "Grande      Casillas  : ""Esta victoria no solo es nuestra sino también de  Jesé ."""   
31056         ¿Querían que Contador analizara cualquier cosa que fuera a     tomar o que la vomitara meses después para mandarla al puto laboratorio?   
1158                                     Eliminados de  champion , van     terceros en la  Liga  y pierden la  final copa del Rey , PURO  REAL MADRID   

      polarity  polarity_bin  
270          P             1  
208          P             1  
902          P             1  
31056        N             0  
1158         N             0  
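
As an aside, I think the two labelling lines in my code (the polarity_bin assignment) could be collapsed into one, which would also avoid pandas' chained-assignment warning (same mapping as far as I can tell, P/P+ -> 1 and everything else -> 0):

# equivalent one-liner for the binary label (untested sketch)
tweets_corpus['polarity_bin'] = tweets_corpus.polarity.isin(['P', 'P+']).astype(int)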

And here is the error:


TypeError                                 Traceback (most recent call last)
<ipython-input-9-7c9b6a1bac93> in <module>()
    201     print("Performing grid search...")
    202     print("pipeline:", [name for name, _ in pipeline.steps])
--> 203     grid_search.fit(tweets_corpus.content, tweets_corpus.polarity_bin)
    204     print(grid_search.best_params_)
    205     t0 = time()

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\grid_search.py in fit(self, X, y)
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
    805 
    806 

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\grid_search.py in _fit(self, X, y, parameter_iterable)
    539                                          n_candidates * len(cv)))
    540 
--> 541         base_estimator = clone(self.estimator)
    542 
    543         pre_dispatch = self.pre_dispatch

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
     49     new_object_params = estimator.get_params(deep=False)
     50     for name, param in six.iteritems(new_object_params):
---> 51         new_object_params[name] = clone(param, safe=False)
     52     new_object = klass(**new_object_params)
     53     params_set = new_object.get_params(deep=False)

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
     37     # XXX: not handling dictionaries
     38     if estimator_type in (list, tuple, set, frozenset):
---> 39         return estimator_type([clone(e, safe=safe) for e in estimator])
     40     elif not hasattr(estimator, 'get_params'):
     41         if not safe:

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in <listcomp>(.0)
     37     # XXX: not handling dictionaries
     38     if estimator_type in (list, tuple, set, frozenset):
---> 39         return estimator_type([clone(e, safe=safe) for e in estimator])
     40     elif not hasattr(estimator, 'get_params'):
     41         if not safe:

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
     37     # XXX: not handling dictionaries
     38     if estimator_type in (list, tuple, set, frozenset):
---> 39         return estimator_type([clone(e, safe=safe) for e in estimator])
     40     elif not hasattr(estimator, 'get_params'):
     41         if not safe:

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in <listcomp>(.0)
     37     # XXX: not handling dictionaries
     38     if estimator_type in (list, tuple, set, frozenset):
---> 39         return estimator_type([clone(e, safe=safe) for e in estimator])
     40     elif not hasattr(estimator, 'get_params'):
     41         if not safe:

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
     49     new_object_params = estimator.get_params(deep=False)
     50     for name, param in six.iteritems(new_object_params):
---> 51         new_object_params[name] = clone(param, safe=False)
     52     new_object = klass(**new_object_params)
     53     params_set = new_object.get_params(deep=False)

C:\Users\Miguel\Anaconda3\lib\site-packages\sklearn\base.py in clone(estimator, safe)
     40     elif not hasattr(estimator, 'get_params'):
     41         if not safe:
---> 42             return copy.deepcopy(estimator)
     43         else:
     44             raise TypeError("Cannot clone object '%s' (type %s): "

C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
    180                             raise Error(
    181                                 "un(deep)copyable object of type %s" % cls)
--> 182                 y = _reconstruct(x, rv, 1, memo)
    183 
    184     # If is its own copy, don't memoize.

C:\Users\Miguel\Anaconda3\lib\copy.py in _reconstruct(x, info, deep, memo)
    296     if state:
    297         if deep:
--> 298             state = deepcopy(state, memo)
    299         if hasattr(y, '__setstate__'):
    300             y.__setstate__(state)

C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
    153     copier = _deepcopy_dispatch.get(cls)
    154     if copier:
--> 155         y = copier(x, memo)
    156     else:
    157         try:

C:\Users\Miguel\Anaconda3\lib\copy.py in _deepcopy_dict(x, memo)
    242     memo[id(x)] = y
    243     for key, value in x.items():
--> 244         y[deepcopy(key, memo)] = deepcopy(value, memo)
    245     return y
    246 d[dict] = _deepcopy_dict

C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
    180                             raise Error(
    181                                 "un(deep)copyable object of type %s" % cls)
--> 182                 y = _reconstruct(x, rv, 1, memo)
    183 
    184     # If is its own copy, don't memoize.

C:\Users\Miguel\Anaconda3\lib\copy.py in _reconstruct(x, info, deep, memo)
    296     if state:
    297         if deep:
--> 298             state = deepcopy(state, memo)
    299         if hasattr(y, '__setstate__'):
    300             y.__setstate__(state)

C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
    153     copier = _deepcopy_dispatch.get(cls)
    154     if copier:
--> 155         y = copier(x, memo)
    156     else:
    157         try:

C:\Users\Miguel\Anaconda3\lib\copy.py in _deepcopy_dict(x, memo)
    242     memo[id(x)] = y
    243     for key, value in x.items():
--> 244         y[deepcopy(key, memo)] = deepcopy(value, memo)
    245     return y
    246 d[dict] = _deepcopy_dict

C:\Users\Miguel\Anaconda3\lib\copy.py in deepcopy(x, memo, _nil)
    172                     reductor = getattr(x, "__reduce_ex__", None)
    173                     if reductor:
--> 174                         rv = reductor(4)
    175                     else:
    176                         reductor = getattr(x, "__reduce__", None)

TypeError: cannot serialize '_io.TextIOWrapper' object
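
Update: one thing I noticed while writing this up is that inside the if __name__ == '__main__': block I do "import tokenize", which rebinds the name tokenize from my function to the standard-library tokenize module, so a module object (not the function) ends up as the vectorizer's tokenizer. Could that be what clone/deepcopy chokes on? If so, I think the fix is just dropping that import and passing the function (untested sketch):

# untested: pass the tokenize() function defined above, and do NOT
# "import tokenize" first -- that would replace the function with the
# stdlib module, which cannot be deep-copied when GridSearchCV clones
# the pipeline
if __name__ == '__main__':
    vectorizer = CountVectorizer(
        analyzer='word',
        tokenizer=tokenize,   # the function from above, not the module
        lowercase=True,
        stop_words=spanish_stopwords)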

Thanks for your time. By the way, I'm working on Windows 10 with all tools up to date.
