1

我有一个自定义 CPF:当文档被存储时,它会提取文本并完成注释。处理大约 1000 个文档时一切正常,随后接下来的几个文档会报错,之后又恢复正常。报错的文件是随机的。以下是错误消息:

2016-02-22 14:49:31.580 Debug: Forest::insert: content-repo-content-001-1 XDMP-INMMLISTFULL: In-memory list storage full; list: table=79%, wordsused=76%, wordsfree=0%, overhead=24%; tree: table=3%, wordsused=87%, wordsfree=13%, overhead=0%
2016-02-22 14:49:31.580 Info: Saving /var/opt/MarkLogic/Forests/content-repo-content-001-1/00000070
2016-02-22 14:49:31.595 Debug: InMemoryStand /var/opt/MarkLogic/Forests/content-repo-content-001-1/00000071, disk=10MB, memory=436MB, list=341MB, tree=85MB, rangeIndex=11MB, reverseIndex=11MB, tripleIndex=44MB
2016-02-22 14:49:31.805 Info: content-repo: File Name : S-2010-000029581.pdf
2016-02-22 14:49:31.805 Info: content-repo: Content Type : application/pdf
2016-02-22 14:49:31.805 Info: content-repo: id : 11c565782e85d213ef00bed474bf25ad84d465d3
2016-02-22 14:49:32.023 Debug: Retrying SVCProcess::run 1 because SVC-PROCESSRUN: Process run error: fork: Cannot allocate memory
2016-02-22 14:49:32.204 Debug: Retrying SVCProcess::run 2 because SVC-PROCESSRUN: Process run error: fork: Cannot allocate memory
2016-02-22 14:49:32.437 Debug: Retrying SVCProcess::run 3 because SVC-PROCESSRUN: Process run error: fork: Cannot allocate memory
2016-02-22 14:49:32.765 Debug: Retrying SVCProcess::run 4 because SVC-PROCESSRUN: Process run error: fork: Cannot allocate memory
2016-02-22 14:49:33.298 Debug: Retrying SVCProcess::run 5 because SVC-PROCESSRUN: Process run error: fork: Cannot allocate memory
2016-02-22 14:49:34.224 Debug: Retrying SVCProcess::run 6 because SVC-PROCESSRUN: Process run error: fork: Cannot allocate memory

在 marklogic 中,有没有办法在每次 CPF 操作后清除临时分配的内存?

谢谢

更新

我猜这是内存错误以及 CPF 没有释放资源造成的。因此我在 CPF 中改用 xdmp:spawn-function 来执行注释,以为这样任务会进入任务队列排队。但这样做之后,即使只处理单个文档,也会出现以下异常:

2016-02-23 16:25:50.498 Info: TaskServer: 2016-02-23 16:25:50.498 Info: TaskServer: XDMP-CONFLICTINGUPDATES 2016-02-23 16:25:50.498 Info: TaskServer: 2016-02-23 16:25:50.498 Info: TaskServer: 1.0-ml 2016-02-23 16:25:50.498 Info: TaskServer: Conflicting updates 2016-02-23 16:25:50.498 Info: TaskServer: XDMP-CONFLICTINGUPDATES: xdmp:document-insert("/documents/BioEln/de249f6f43d7e6ecdb1a809769852542a944087c.pdf/a...",

以下是我的代码,我的 CPF 调用此函数repo-lib:transformDoc

(:~
 : Schedules annotation of a document.
 :
 : When repo-lib:isEmpty reports $uri as empty nothing happens. Otherwise
 : repo-lib:loadAnnotatedDoc is handed to xdmp:spawn-function together with
 : options requesting an update transaction, and the spawned function calls
 : xdmp:commit() itself once the annotation work is done.
 :
 : @param $uri URI of the document to annotate
 : @return always the empty sequence
 :)
declare function repo-lib:transformDoc($uri as xs:string){
    if (repo-lib:isEmpty($uri)) then
        ()
    else
        (: options for the spawned task: run it as an update transaction :)
        let $spawnOptions :=
            <options xmlns="xdmp:eval">
                <transaction-mode>update</transaction-mode>
            </options>
        (: the annotation work itself, followed by an explicit commit :)
        let $annotateTask := function() {
            repo-lib:loadAnnotatedDoc($uri),
            xdmp:commit()
        }
        (: the synchronous alternative would be a direct call to repo-lib:loadAnnotatedDoc($uri) :)
        let $_ := xdmp:spawn-function($annotateTask, $spawnOptions)
        return ()
};

(:~
 : Annotates the document stored at $uri.
 :
 : This is a thin indirection over the annotation backend: the work is
 : delegated to repo-lib:loadTermiteAnnotations and its result is returned
 : unchanged.
 :
 : @param $uri URI of the document to annotate
 : @return whatever repo-lib:loadTermiteAnnotations returns for $uri
 :)
declare function repo-lib:loadAnnotatedDoc($uri as xs:string) as xs:string
{
    (: Termite is the current backend; another one, e.g. smartlogic, could be substituted here. :)
    let $annotatedUri := repo-lib:loadTermiteAnnotations($uri)
    return $annotatedUri
};

(:~
 : Deletes the document at $uri, but only when a URI was supplied and a
 : document is actually stored there.
 :
 : @param $uri URI of the document to remove; the empty sequence and the
 :             empty string are both treated as "nothing to delete"
 : @return the empty sequence
 :)
declare private function repo-lib:deleteDocument($uri as xs:string?)
{
    (: guard: no usable URI :)
    if (fn:not($uri)) then ()
    (: guard: nothing stored at that URI :)
    else if (fn:empty(fn:doc($uri))) then ()
    else xdmp:document-delete($uri)
};

(:~
 : Extracts the text of the document at $uri, sends it to the Termite
 : service for annotation, and stores the transformed response as a new
 : document under "$uri/annotatedText/<id>.xml".
 :
 : The id is taken from the "id" property of the source document when one is
 : present; otherwise it is the SHA-1 of "$uri/annotatedText/".
 :
 : The annotated document is written with xdmp:document-insert only. It is
 : deliberately NOT deleted first: a delete and an insert of the same URI in
 : one transaction are conflicting updates (XDMP-CONFLICTINGUPDATES), and the
 : insert already replaces any existing content at that URI.
 :
 : @param $uri URI of the source document
 : @return the URI of the annotated document, or "" when $uri is empty, when
 :         the source document does not exist, or when any step inside the
 :         try block fails (the error is logged, not rethrown)
 :)
declare function repo-lib:loadTermiteAnnotations($uri as xs:string) as xs:string
{
    if(repo-lib:isEmpty($uri)) then ""
    else
        if(crfslib:uri-exists($uri)) then
            let $annotatedDir := $uri||"/annotatedText/"
            (: reuse the id recorded on the source document, if any :)
            let $storedId := xdmp:document-properties($uri)//id/text()
            let $id  := if($storedId) then $storedId
                        else xdmp:sha1($annotatedDir) (: otherwise derive one from the target directory :)
            (: application context recorded on the source document; reused as a collection and a property :)
            let $app  := xdmp:document-properties($uri)//context/text()

            let $newDocUri := $annotatedDir || $id || ".xml"

            (: Remove a previously recorded annotated copy, but only when it lives
               at a different URI. If it is the URI we are about to insert, the
               insert below overwrites it, and deleting it here as well would
               raise XDMP-CONFLICTINGUPDATES. :)
            let $annotatedTexLocation := xdmp:document-properties($uri)//annotatedText-location/text()
            let $_ :=
                if ($annotatedTexLocation = $newDocUri) then ()
                else repo-lib:deleteDocument($annotatedTexLocation)

            return
                try{
                    let $boundary := "------WebKitFormBoundaryIbhnU2N5CiXjjSU0"
                    let $termite := "http://10.239.12.38:8080/termite"
                    (: NOTE(review): there is a space before DRUGTYP in this list; it looks
                       accidental, confirm whether Termite tolerates it :)
                    let $dictionary := "ADVENT,ADVENTMED,ANAT,BIOCHEM,BIOPROC,CELLLINE,CHEMBLDRUG,CHEMSTR,COMPANY,COMPOUNDS,DRUG,DRUGFIND, DRUGTYP,INDICATION,PROTYP,LABCHEM,GENE,GENEONT,MOA,ORPHAN,CHEMO,DEVICE,SPECIES"

                    (: serialized output of the document filter for the source document :)
                    let $text := xdmp:quote(xdmp:document-filter(fn:doc($uri)))

                    (: call Termite to do the annotations :)
                    let $termite_output := lib-multipart-post:multipart-post($termite, $boundary, (<data name="text">{$text}</data>,
                                                                                            <data name="format">any.xml</data>,
                                                                                            <data name="output">xml</data>,
                                                                                             <data name="entities">{$dictionary}</data>))
                    (: Identity transform that unwraps ResponseBlock/ResponsePayload and
                       drops the listed elements. It is applied to the second item of the
                       post result, presumably the response body; verify against
                       lib-multipart-post. :)
                    let $transformed_annotations :=
                        xdmp:xslt-eval(
                            <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
                                <xsl:output method="xml" indent="yes" encoding="utf-8"/>
                                <xsl:strip-space elements="*"/>

                                <xsl:template match="@*|node()">
                                    <xsl:copy>
                                        <xsl:apply-templates select="@*|node()" />
                                    </xsl:copy>
                                </xsl:template>
                                <!-- Apply all child nodes; don't copy the element itself -->
                                <xsl:template match="ResponseBlock|ResponsePayload">
                                    <xsl:apply-templates/>
                                </xsl:template>
                                <!-- Drop elements -->
                                <xsl:template match="DictSynList|MatchedSynonyms|TaxonomyList|Taxonomy|MetaHeader|Source|HitLocations|HitLocationString|Exact|TotalSyns|HitCount|Score|NonAmbigSyns|Fragments|KeyValuePairs|Mappings"/>

                            </xsl:stylesheet>,$termite_output[2]
                        )

                    (: insert, or overwrite in place, the annotated document :)
                    let $_ := xdmp:document-insert($newDocUri, $transformed_annotations,(xdmp:default-permissions()), ("annotated", "termite", $app))

                    (: Add the bookkeeping properties. An overwritten document keeps its
                       existing properties, so only add the ones whose names are not
                       already present; otherwise every re-run would append duplicates. :)
                    let $wantedProps := (<document-parent-location>{$uri}</document-parent-location>,
                                         <context>{$app}</context>,
                                         <id>{$id}</id>)
                    let $presentNames := xdmp:document-properties($newDocUri)//*/fn:node-name(.)
                    let $missingProps := $wantedProps[fn:not(fn:node-name(.) = $presentNames)]
                    let $_ :=
                        if (fn:exists($missingProps)) then
                            xdmp:document-add-properties($newDocUri, $missingProps)
                        else ()
                    let $log := xdmp:log("Dne inserting the document")

                    (: Recording the annotated location back on the source document is
                       currently disabled; it would be
                       xdmp:document-add-properties($uri, annotatedText-location = $newDocUri). :)
                    return $newDocUri
                }catch($ex) {
                    (: best effort: log the failure and report "no annotated document" :)
                    let $log := xdmp:log("ERROR !!!!!")
                    let $log := xdmp:log($ex)
                    return ""
                }
        else
            let $log := xdmp:log("ERRRO!!!! - Document does not exisist yet")
            return ""
};
4

1 回答 1

1

我认为您试图在一个 CPF 动作(action)中做太多的事情。CPF 的设计初衷是充当状态引擎:文档逐步经过各个状态,由多个动作各自完成一小部分变更。

为此,CPF 会使用 cpf: 命名空间中的文档属性来跟踪状态和进度,我认为这就是您遇到更新冲突的原因。不要使用 spawn;要么把工作拆分为多个动作,要么考虑改用一个简单的触发器一次完成所有操作。

于 2016-02-25T10:25:42.133 回答