0

我们的应用程序中的步骤是:

  1. 获取推文并将其存储在 MongoDB 集合 search_tweets 中。
  2. 从 search_tweets 读取数据,计算若干参数,并将结果存储到另一个集合 Twitter_Processed 中。该集合的键数量少得多,其中大部分是计算得出的字段。

示例文档 - search_tweets

{
  "_id" : ObjectId("5372fae4e4b0e6088a20a4f4"),
  "contributors" : null,
  "text" : "Stay far from #Massachusetts General Hospital. It secretly imposes #unilateral #DNRs against patient & family wishes. http://t.co/hDuK96gqyU",
  "geo" : null,
  "retweeted" : false,
  "in_reply_to_screen_name" : null,
  "possibly_sensitive" : false,
  "truncated" : false,
  "lang" : "en",
  "entities" : {
    "symbols" : [],
    "urls" : [{
        "expanded_url" : "http://www.nationalreview.com/human-exceptionalism/377745/unilateral-non-resuscitation-massgeneral-wesley-j-smith",
        "indices" : [122, 144],
        "display_url" : "nationalreview.com/human-exceptio…",
        "url" : "http://t.co/hDuK96gqyU"
      }],
    "hashtags" : [{
        "text" : "Massachusetts",
        "indices" : [14, 28]
      }, {
        "text" : "unilateral",
        "indices" : [67, 78]
      }, {
        "text" : "DNRs",
        "indices" : [79, 84]
      }],
    "user_mentions" : []
  },
  "in_reply_to_status_id_str" : null,
  "id" : NumberLong("466053869173088258"),
  "source" : "web",
  "in_reply_to_user_id_str" : null,
  "favorited" : false,
  "in_reply_to_status_id" : null,
  "retweet_count" : 0,
  "created_at" : "Tue May 13 03:14:35 +0000 2014",
  "in_reply_to_user_id" : null,
  "favorite_count" : 0,
  "id_str" : "466053869173088258",
  "place" : null,
  "user" : {
    "location" : "Gainesville, VA",
    "default_profile" : false,
    "profile_background_tile" : true,
    "statuses_count" : 3978,
    "lang" : "en",
    "profile_link_color" : "E82012",
    "profile_banner_url" : "https://pbs.twimg.com/profile_banners/137925469/1353178590",
    "id" : 137925469,
    "following" : false,
    "protected" : false,
    "favourites_count" : 156,
    "profile_text_color" : "E0B392",
    "description" : "Saved by faith in Jesus and ministering at nursing homes with my wife, Germaine.  We pray for the abolition of abortion and all forms of slavery (trafficking).",
    "verified" : false,
    "contributors_enabled" : false,
    "profile_sidebar_border_color" : "FFFFFF",
    "name" : "Lawrence Sylvain",
    "profile_background_color" : "F9F4E1",
    "created_at" : "Wed Apr 28 04:24:00 +0000 2010",
    "is_translation_enabled" : false,
    "default_profile_image" : false,
    "followers_count" : 157,
    "profile_image_url_https" : "https://pbs.twimg.com/profile_images/3150852957/fdfc37190dec1e382a7fe6489a611399_normal.jpeg",
    "geo_enabled" : false,
    "profile_background_image_url" : "http://pbs.twimg.com/profile_background_images/714440493/0d3e10e0aee5e662fcf3a6a5a686c541.png",
    "profile_background_image_url_https" : "https://pbs.twimg.com/profile_background_images/714440493/0d3e10e0aee5e662fcf3a6a5a686c541.png",
    "follow_request_sent" : false,
    "entities" : {
      "description" : {
        "urls" : []
      },
      "url" : {
        "urls" : [{
            "expanded_url" : "http://www.LJSConsulting.com/",
            "indices" : [0, 22],
            "display_url" : "LJSConsulting.com",
            "url" : "http://t.co/37QKbkdxpa"
          }]
      }
    },
    "url" : "http://t.co/37QKbkdxpa",
    "utc_offset" : -18000,
    "time_zone" : "Lima",
    "notifications" : false,
    "profile_use_background_image" : true,
    "friends_count" : 331,
    "profile_sidebar_fill_color" : "BE9E78",
    "screen_name" : "ljsylvain",
    "id_str" : "137925469",
    "profile_image_url" : "http://pbs.twimg.com/profile_images/3150852957/fdfc37190dec1e382a7fe6489a611399_normal.jpeg",
    "listed_count" : 0,
    "is_translator" : false
  },
  "coordinates" : null,
  "metadata" : {
    "result_type" : "recent",
    "iso_language_code" : "en"
  },
  "token" : {
    "CONSUMER_SECRET" : "4FsHhU2KuYsbowCJsuZ4RtsUq4rpLQQcQAGeXkIZqY",
    "ACCESS_TOKEN" : "2362487558-EflbK1NLJMjhAQnXQHkmRwMCwqBBlZ2Y0KRnydf",
    "CONSUMER_KEY" : "cmZaUbCeDpb9SWCwUlCNsA",
    "ACCESS_SECRET" : "gNvLl6n3gXwXBl7GZfHpmUMreqNn1OULz30SpT29jAYwy",
    "APP_ID" : "3"
  },
  "searchProfileId" : 905,
  "customerId" : 0,
  "schedularId" : 32446,
  "userId" : "395",
  "subject" : "Massachusetts General Hospital ",
  "context" : "Healthcare",
  "tagId" : 0,
  "domain" : "General",
  "uniqueId" : 933,
  "message" : "Stay far from #Massachusetts General Hospital. It secretly imposes #unilateral #DNRs against patient & family wishes. http://t.co/hDuK96gqyU",
  "searchkeyword" : "Massachusetts General Hospital ",
  "connectortype" : "TWITTER"
}

示例文档 - Twitter_Processed

{
  "_id" : {
    "SpId" : 905,
    "Channel_Id" : 0,
    "ActivityId" : "137925469",
    "SchedularId" : 32446,
    "UniqueID" : 933
  },
  "Cust_Id" : " 0",
  "Domain_Id" : " General",
  "searchkeyword" : " Massachusetts General Hospital ",
  "Sentiment" : 0.42,
  "Intention_cause" : "None",
  "Intention_category" : "None",
  "Emotion_category" : "neutral",
  "AgeGroup" : "26-35",
  "Gender" : "M",
  "Location" : {
    "Country" : "United Kingdom",
    "Lat" : "54.7136300",
    "Long" : "-6.2142800",
    "City" : "Northern Ireland"
  },
  "message" : "Stay far from Massachusetts General Hospital. It secretly imposes unilateral DNRs against patient &amp family wishes. httpt.cohDuK96gqyU",
  "PostDate" : ISODate("2014-05-12T21:44:35Z"),
  "DetectedLanguage" : "English",
  "ProcessedText" : "%*Stay*% far from Massachusetts General Hospital . ==0.01:::It secretly imposes unilateralDNRs against %*patient*% &amp family wishes . ==1.25:::httpt.cohDuK96gqy U ==0:::",
  "IdeaCloudText" : " PS_stay PS_patient",
  "IdeaCloudTopicSpoken" : "  Massachusetts General unilateralDNRs patient family",
  "userid" : " ljsylvain",
  "followers" : 157,
  "like_count" : 0,
  "plusone_count" : 0
}

考虑到读取文档时的以下情况,我应该如何构建一个或多个索引:

  • 在 find/$match 中,使用键“searchProfileId”和“schedularId”
  • 返回键“text”、“user”(有几个子文档)、“searchProfileId”、“schedularId”和“uniqueId”、“id_str”
  • 推文的“text”字段内容是随机的——将其包含在复合索引中是否有意义?如果没有意义,构建文本索引是否会有任何帮助(不会在“text”的值内进行文本搜索)?

基本上,我想确保这些查询都是覆盖查询(covered query)。

4

0 回答 0