1

使用 CachedSqlEntityProcessor 时完全导入失败,出现异常

java.lang.OutOfMemoryError: GC overhead limit exceeded(超出 GC 开销限制)

我该如何解决这个问题.......

不使用 CachedSqlEntityProcessor 需要 15 小时才能建立索引

我的 products-data-config.xml 是

<!--
  Solr DataImportHandler configuration for the "products" document.

  The root entity selects products together with their category, brand and
  image (all LEFT OUTER JOINs); the nested entity attaches the rows of
  product_properties_t that share the parent's PRODUCT_ID.

  NOTE(review): for MySQL the Solr DIH FAQ suggests batchSize="-1" so that the
  driver streams rows instead of buffering the whole result set. Confirm
  whether that applies to this setup before changing the value below.
-->
<dataConfig>
    <dataSource
        type="JdbcDataSource"
        driver="com.mysql.jdbc.Driver"
        url="jdbc:mysql://localhost:3306/localbazaar"
        user="root"
        password="sa"
        batchSize="100" />

    <document name="products">
        <entity
            name="domainProduct"
            query="SELECT p.PRODUCT_ID, p.NAME, LOWER(REPLACE(REPLACE(p.NAME,' ','-'),'/','-')) AS purl, p.description, p.BRAND_ID, p.CATEGORY_ID, p.GROUP_ID, p.MIN_PRICE, p.MAX_PRICE, p.AUTHOR, p.ISBN10, p.ISBN13, p.OLID, p.EAN13, p.UPCA, p.SKU, p.LANGUAGE, p.FORMAT, p.PUBLISHER, p.SUBJECT, c.NAME AS cname, c.URL_NAME, b.NAME AS bname, LOWER(REPLACE(REPLACE(b.NAME,' ','-'),'/','-')) AS bUrl, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',i.IMAGE_NAME) AS productImage FROM product_t p LEFT OUTER JOIN category_t c ON (c.CATEGORY_ID=p.CATEGORY_ID) LEFT OUTER JOIN brand_t b ON (b.BRAND_ID=p.BRAND_ID) LEFT OUTER JOIN image_t i ON (i.ASSET_ID=p.PRODUCT_ID AND i.ASSET_TYPE_ID = 4 AND i.IMAGE_TYPE_ID = 0)">

            <!-- Columns taken from product_t (purl is the URL-friendly form of NAME). -->
            <field name="productId" column="PRODUCT_ID" />
            <field name="productName" column="NAME" />
            <field name="productUrlName" column="purl" />
            <field name="productDescription" column="description" />
            <field name="brandId" column="BRAND_ID" />
            <field name="categoryId" column="CATEGORY_ID" />
            <field name="groupId" column="GROUP_ID" />
            <field name="minPrice" column="MIN_PRICE" />
            <field name="maxPrice" column="MAX_PRICE" />
            <field name="author" column="AUTHOR" />
            <field name="isbn10" column="ISBN10" />
            <field name="isbn13" column="ISBN13" />
            <field name="olid" column="OLID" />
            <field name="ean13" column="EAN13" />
            <field name="upca" column="UPCA" />
            <field name="sku" column="SKU" />
            <field name="language" column="LANGUAGE" />
            <field name="format" column="FORMAT" />
            <field name="publisher" column="PUBLISHER" />
            <field name="subject" column="SUBJECT" />

            <!-- Columns taken from the joined category_t, brand_t and image_t rows. -->
            <field name="categoryName" column="cname" />
            <field name="categoryUrlName" column="URL_NAME" />
            <field name="brandName" column="bname" />
            <field name="brandUrlName" column="bUrl" />
            <field name="productImage" column="productImage" />

            <!--
              Product properties rendered as "PROPERTY_NAME:::property_value".
              CachedSqlEntityProcessor keeps the query result in a cache and the
              "where" attribute names the key used to match rows to the parent.
            -->
            <entity
                name="specifications"
                processor="CachedSqlEntityProcessor"
                query="select PRODUCT_ID, CONCAT(PROPERTY_NAME,':::',property_value) as specifications FROM product_properties_t "
                where="PRODUCT_ID=domainProduct.PRODUCT_ID" />
        </entity>
    </document>
</dataConfig>

我的 store-products-data-config.xml 是

<dataConfig>
    <!--
      Solr DataImportHandler configuration for store/product rows.

      The root entity reads store_products_t and LEFT OUTER JOINs the store,
      address, product, category, brand, image (product and store) and city
      tables. Two cached child entities attach product specifications and
      store properties.

      NOTE(review): for MySQL the Solr DIH FAQ suggests batchSize="-1" so that
      the driver streams rows instead of buffering the whole result set.
      Confirm whether that applies to this setup before changing the value.
    -->
    <dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/localbazaar" user="root" password="sa" batchSize="100" />
    <document name="products">
        <entity name="domainStoreProduct" query="SELECT sp.STORE_PRODUCT_ID, sp.STORE_ID, sp.PRODUCT_ID, sp.MIN_PRICE, sp.MAX_PRICE, sp.STORE_TYPE_ID, sp.BUY_X, sp.GET_Y, s.NAME AS sname, LOWER(REPLACE(REPLACE(s.NAME,' ','-'),'/','-')) AS sUrl, s.DESCRIPTION AS sdesc, s.WEB_SITE_UTL, s.EMAIL, s.PHONE, s.MOBILE, s.ACTIVE AS act, a.ADDRESS_ID, a.location, LOWER(REPLACE(REPLACE(a.location,' ','-'),'/','-')) AS urlLoc, a.ADDRESS_LINE1, a.ADDRESS_LINE2, a.LATITUDE, a.LONGITUDE, a.zipcode, a.LANDMARK, a.CITY, CONCAT(a.LATITUDE,',',a.LONGITUDE) AS ll, p.NAME AS pname, LOWER(REPLACE(REPLACE(p.NAME,' ','-'),'/','-')) AS purl, p.description AS pdesc, p.BRAND_ID, p.CATEGORY_ID, p.GROUP_ID, p.AUTHOR, p.ISBN10, p.ISBN13, p.OLID, p.EAN13, p.UPCA, p.SKU, p.LANGUAGE, p.FORMAT, p.PUBLISHER, p.SUBJECT, c.NAME AS cname, c.URL_NAME, b.NAME AS bname, LOWER(REPLACE(REPLACE(b.NAME,' ','-'),'/','-')) AS bUrl, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',ip.IMAGE_NAME) AS pImage, CONCAT('http://partnercenter.localbazaar.com/image?imageId=',ist.IMAGE_NAME) AS sImage, ci.CITY_ID FROM store_products_t sp LEFT OUTER JOIN store_t s ON (sp.STORE_ID=s.STORE_ID) LEFT OUTER JOIN address_t a ON (a.ASSET_TYPE_ID=3 AND a.ASSET_ID=sp.STORE_ID) LEFT OUTER JOIN product_t p ON (p.PRODUCT_ID=sp.PRODUCT_ID) LEFT OUTER JOIN category_t c ON (c.CATEGORY_ID=p.CATEGORY_ID) LEFT OUTER JOIN brand_t b ON (b.BRAND_ID=p.BRAND_ID) LEFT OUTER JOIN image_t ip ON (ip.ASSET_ID=sp.PRODUCT_ID AND ip.ASSET_TYPE_ID=4 AND ip.IMAGE_TYPE_ID=0) LEFT OUTER JOIN image_t ist ON (ist.ASSET_ID=sp.STORE_ID AND ist.ASSET_TYPE_ID=3 AND ist.IMAGE_TYPE_ID=0) LEFT OUTER JOIN city_t ci ON (ci.NAME=a.CITY)">
            <!-- Columns from store_products_t. -->
            <field column="STORE_PRODUCT_ID" name="storeProductId" />
            <field column="STORE_ID" name="storeId" />
            <field column="PRODUCT_ID" name="productId" />
            <field column="MIN_PRICE" name="storeMinPrice" />
            <field column="MAX_PRICE" name="storeMaxPrice" />
            <field column="STORE_TYPE_ID" name="storeTypeId" />
            <field column="BUY_X" name="buyX" />
            <field column="GET_Y" name="getY" />
            <!-- Columns from the joined store_t row (sUrl is the URL-friendly form of the store name). -->
            <field column="sname" name="storeName" />
            <field column="sUrl" name="storeUrlName" />
            <field column="sdesc" name="description" />
            <field column="WEB_SITE_UTL" name="webSiteUrl" />
            <field column="EMAIL" name="email" />
            <field column="PHONE" name="phone" />
            <field column="MOBILE" name="mobile" />
            <field column="act" name="active" />
            <!-- Columns from the joined address_t row; ll is "LATITUDE,LONGITUDE". -->
            <field column="ADDRESS_ID" name="addressId" />
            <field column="location" name="location" />
            <field column="urlLoc" name="urlLocation" />
            <field column="ADDRESS_LINE1" name="addressLine1" />
            <field column="ADDRESS_LINE2" name="addressLine2" />
            <field column="LATITUDE" name="latitude" />
            <field column="LONGITUDE" name="longitude" />
            <field column="zipcode" name="zipcode" />
            <field column="LANDMARK" name="landmark" />
            <field column="CITY" name="city" />
            <field column="ll" name="latlong" />
            <!-- Columns from the joined product_t, category_t and brand_t rows. -->
            <field column="pname" name="productName" />
            <field column="purl" name="productUrlName" />
            <field column="pdesc" name="productDescription" />
            <field column="BRAND_ID" name="brandId" />
            <field column="CATEGORY_ID" name="categoryId" />
            <field column="GROUP_ID" name="groupId" />
            <field column="AUTHOR" name="author" />
            <field column="ISBN10" name="isbn10" />
            <field column="ISBN13" name="isbn13" />
            <field column="OLID" name="olid" />
            <field column="EAN13" name="ean13" />
            <field column="UPCA" name="upca" />
            <field column="SKU" name="sku" />
            <field column="LANGUAGE" name="language" />
            <field column="FORMAT" name="format" />
            <field column="PUBLISHER" name="publisher" />
            <field column="SUBJECT" name="subject" />
            <field column="cname" name="categoryName" />
            <field column="URL_NAME" name="categoryUrlName" />
            <field column="bname" name="brandName" />
            <field column="bUrl" name="brandUrlName" />
            <!-- Image URLs (product and store) and the city id resolved through city_t. -->
            <field column="pImage" name="productImage" />
            <field column="sImage" name="storeImage" />
            <field column="CITY_ID" name="cityId" />
            <!--
              Cached child entities. The attribute must be spelled "where" in
              lower case: DIH attribute names are case-sensitive, so an
              upper-case WHERE is not read and the cached rows are not keyed
              by the parent's PRODUCT_ID / STORE_ID.
            -->
            <entity name="specifications" query="select PRODUCT_ID, CONCAT(PROPERTY_NAME,':::',property_value) as specifications FROM product_properties_t " processor="CachedSqlEntityProcessor" where="PRODUCT_ID=domainStoreProduct.PRODUCT_ID" />
            <entity name="storeProperties" query="select STORE_ID, CONCAT(PROPERTY_ID,':::',PROPERTY_VALUE) as storeProperties FROM store_properties_t " processor="CachedSqlEntityProcessor" where="STORE_ID=domainStoreProduct.STORE_ID" />
        </entity>
    </document>
</dataConfig>
4

1 回答 1

2

你可以尝试不同的东西:

  1. 尝试设置batchSize属性。如果您正确调整它,您可以提高数据源的性能。
  2. SELECT *总是比提供您需要的慢(即使您需要所有列)。我建议使用SELECT PRODUCT_ID, NAME, ...而不是使用*
  3. 为什么有实体 b、i 和 s?您不使用其中的字段,因此我认为它们不是很有用
  4. 尝试为您的子实体使用CachedSqlEntityProcessor。它只会检索一次数据并为每个子实体重新使用它。
  5. 您的产品是否可以属于 1 个以上的类别(它是多值字段吗?),如果不是,那么使用JOINS编写一个查询比编写多个实体更快。

编辑:我建议将这件事分成 2 个问题,因为现在其他人用我的旧答案阅读你的新问题真的很奇怪。

我认为您无法指定 CachedSqlEntityProcessor 把它的缓存放在哪里(我认为缓存始终在内存中)。您的数据导入需要 8 小时,原因在于记录数量非常大,因此会执行大量查询(每个子实体都会执行自己的查询)。

您的问题的解决方案是删除子实体,并在您的父实体中将子实体的查询添加为逗号分隔列表。我建议看看这个答案

如果您这样做,您的所有规范(例如)将作为逗号分隔的列表存储在一列中。然后,您可以使用 Solr ScriptTransformer 拆分值并创建多个值。

这将查询的数量限制为 1 个大查询,并且还会限制 RAM 的使用,因为它将单独解析每个查询。我不知道性能会如何,因为您必须单独解析每个实体。

如果这不起作用,我认为没有比等待 8 小时完成数据导入更好的解决方案了。您不能指望 Solr 一眨眼就把所有数据索引完。您可以尝试使用 cronjob 在夜间运行此任务。

于 2013-04-25T13:57:30.153 回答