0

I am trying to scrape a PDF with tables using Python and the tabula package. In some cases, two columns are extracted with their contents interleaved. I know that the column "Type" should only have these two values: EE-Male or EE-Female. Thus, I need to remove all the extra letters from column "Type" and append them to the end of column "Name" in the exact order in which they appear.

Name                        Type
CHAK NO.162 NB PURANI AB    AEDEI-Male
EXCELLENT (ATTACH WITH GC   EEET-)M JaEleHLUM
PIND KHAN (TRATANI SAMAN    EDE) -Female
BASTI JAM SUMMAR PO RUKA    NEEP-UMRale
BASTI QAZIAN P/O KHANBEL    AEE-Female
GHAUS PUR MACHIAN PO RU     EKEA-FNe PmUaRle
NOOR MUHAMMAD CHEENR        AELE W-FAemLAale
PHATHI THARO KHELAN WAL     EI E-Female
WAH SAIDAN PO DAJAL RANJA   ENE P-MUaRle

As a result I would need to have these two columns:

Name                                  Type
CHAK NO.162 NB PURANI ABADI           EE-Male
EXCELLENT (ATTACH WITH GCET) JEHLUM   EE-Male
PIND KHAN (TRATANI SAMAND)            EE-Female
BASTI JAM SUMMAR PO RUKANPUR          EE-Male
BASTI QAZIAN P/O KHANBELA             EE-Female
GHAUS PUR MACHIAN PO RUKAN PUR        EE-Female
NOOR MUHAMMAD CHEENRAL WALA           EE-Female
PHATHI THARO KHELAN WALI              EE-Female
WAH SAIDAN PO DAJAL RANJAN PUR        EE-Male

Any suggestion? Thanks!

4

2 回答 2

0

Where / how exactly do you want to do this? Since tabula is a Java library, I'm assuming you want to use Java. So here is one way to do it, though it is not the most elegant:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Repairs two-column text in which the tail of the "Name" column has been
 * interleaved with the "Type" column, where "Type" is known to be either
 * {@code EE-Male} or {@code EE-Female}.
 */
public class Main {

    private static final String MALE = "EE-Male";
    private static final String FEMALE = "EE-Female";

    // Compiled once: the patterns do not depend on the row being processed.
    private static final Pattern MALE_PATTERN = subsequencePattern(MALE);
    private static final Pattern FEMALE_PATTERN = subsequencePattern(FEMALE);

    /**
     * Builds a pattern that matches any string containing the characters of
     * {@code category} in order, capturing everything around and between
     * them: {@code (.*)c1(.*)c2(.*)...cn(.*)}.
     *
     * @param category the expected clean value, e.g. {@code "EE-Male"}
     * @return the compiled pattern with {@code category.length() + 1} groups
     */
    private static Pattern subsequencePattern(String category) {
        StringBuilder regex = new StringBuilder("(.*)");
        for (int i = 0; i < category.length(); ++i) {
            // Quote each character so regex metacharacters are taken literally.
            regex.append(Pattern.quote(String.valueOf(category.charAt(i))));
            regex.append("(.*)");
        }
        return Pattern.compile(regex.toString());
    }

    /**
     * Cleans every data row of {@code text}.
     *
     * Rows are separated by {@code \n} and columns by two or more whitespace
     * characters. The first row is treated as a header. For each data row with
     * exactly two columns whose second column contains one of the known types
     * as a subsequence, the extra characters are appended (in order) to the
     * first column and the second column is replaced by the clean type.
     * "EE-Female" is tried before "EE-Male".
     *
     * Rows that cannot be repaired (not exactly two columns, or no known type
     * found) are emitted with their columns joined by a tab, exactly like the
     * header row, instead of being dropped.
     *
     * @param text the raw multi-line text
     * @return the rows joined by {@code \n}, columns joined by a tab
     */
    public static String fixMixedText(String text) {
        String[] rows = text.split("\n");
        String[] newRows = new String[rows.length];

        for (int i = 0; i < rows.length; ++i) {
            String[] cols = rows[i].split("\\s{2,}"); // 2 or more whitespaces
            String unchanged = String.join("\t", cols);

            // Header row, blank line, or unexpected column count: pass through.
            if (i == 0 || cols.length != 2) {
                newRows[i] = unchanged;
                continue;
            }

            String type = FEMALE;
            Matcher m = FEMALE_PATTERN.matcher(cols[1]);
            if (!m.find()) {
                type = MALE;
                m = MALE_PATTERN.matcher(cols[1]);
                if (!m.find()) {
                    // No match of either type: keep the row rather than
                    // leaving a null entry that would print as "null".
                    newRows[i] = unchanged;
                    continue;
                }
            }

            StringBuilder leftover = new StringBuilder();
            // Start at 1 because group(0) is the entire match.
            for (int g = 1; g <= m.groupCount(); ++g) {
                leftover.append(m.group(g));
            }
            newRows[i] = cols[0] + leftover + "\t" + type;
        }

        return String.join("\n", newRows);
    }

    public static void main(String... args) {

        String origText = "Name                        Type\n" +
                "CHAK NO.162 NB PURANI AB    AEDEI-Male\n" +
                "EXCELLENT (ATTACH WITH GC   EEET-)M JaEleHLUM\n" +
                "PIND KHAN (TRATANI SAMAN    EDE) -Female\n" +
                "BASTI JAM SUMMAR PO RUKA    NEEP-UMRale\n" +
                "BASTI QAZIAN P/O KHANBEL    AEE-Female\n" +
                "GHAUS PUR MACHIAN PO RU     EKEA-FNe PmUaRle\n" +
                "NOOR MUHAMMAD CHEENR        AELE W-FAemLAale\n" +
                "PHATHI THARO KHELAN WAL     EI E-Female\n" +
                "WAH SAIDAN PO DAJAL RANJA   ENE P-MUaRle";

        String fixedText = fixMixedText(origText);
        System.out.println(fixedText);

        /*
        Name    Type
        CHAK NO.162 NB PURANI ABADI EE-Male
        EXCELLENT (ATTACH WITH GCET) JEHLUM EE-Male
        PIND KHAN (TRATANI SAMAND)  EE-Female
        BASTI JAM SUMMAR PO RUKANPUR    EE-Male
        BASTI QAZIAN P/O KHANBELA   EE-Female
        GHAUS PUR MACHIAN PO RUKAN PUR  EE-Female
        NOOR MUHAMMAD CHEENRAL WALA EE-Female
        PHATHI THARO KHELAN WALI    EE-Female
        WAH SAIDAN PO DAJAL RANJAN PUR  EE-Male
        */
    }
}
于 2020-06-26T07:36:18.740 回答
0

Here is a solution that worked for me using python:

categories = ["EE-Male", "EE-Female"]
# Map each category to a regular expression that matches any string containing
# the category's characters in order, with arbitrary text around and between them.
categories_regex = {}
for category in categories:
    categories_regex[category] = ".*" + ".*".join(list(category)) + ".*"

# Write the cleaned category to a separate 'type2' column: clean_name_var needs
# both the raw cell ('type') and the cleaned value ('type2') to recover the
# letters that belong to the name. Overwriting 'type' here would discard them.
df['type2'] = df.apply(lambda row: clean_categorical_var(row['type'], categories, categories_regex), axis=1)
df['name'] = df.apply(lambda row: clean_name_var(row, 'type', 'name', categories, 'type2'), axis=1)
# Replace the raw column with the cleaned one under the original name.
df.drop(labels=["type"], axis=1, inplace=True)
df.rename(columns={"type2": "type"}, inplace=True)

And I used the following three auxiliary functions:

def clean_categorical_var(categorical_cell, categories, categories_regex):
    '''
    Cleans a categorical variable cell such as the type variable.
    Input:
        categorical_cell (str): content of the categorical cell to clean
        categories (list): list with the values (str) expected in the
                           categorical column (ex. EE-Male, EE-Female)
        categories_regex (dict): categories as keys and a regular expression for
                                 each category as values.
    Output:
        cleaned_category (str): the category whose regular expression matches
                                the cell, or NaN when none matches or the cell
                                is not a string. If several categories match,
                                the last one in `categories` is returned.
    '''
    # A non-string cell (e.g. NaN for a missing value) cannot be matched;
    # re.match would raise TypeError on it.
    if not isinstance(categorical_cell, str):
        return np.nan

    cleaned_category = np.nan
    for category in categories:
        if re.match(categories_regex[category], categorical_cell):
            # No break on purpose: a later matching category overrides an earlier one.
            cleaned_category = category

    return cleaned_category


def remove_letters(category, string_to_clean):
    '''
    Finds the positions in the dirty string occupied by the letters of the category.
    The dirty string is scanned left to right and each category letter is paired
    with its first not-yet-used occurrence, in order.
    Input:
        category (str): the clean value expected in the categorical column
                        (ex. EE-Male)
        string_to_clean (str): dirty categorical cell containing the category
                               letters mixed with extra letters
    Output:
        letters_index_to_delete (list): indexes (int) in string_to_clean of the
                                        category letters found, in ascending
                                        order. Shorter than the category if not
                                        every letter was found; empty if the
                                        category is empty.
    '''
    expected = list(category)
    letters_index_to_delete = []
    if not expected:
        # Nothing to look for; indexing expected[0] below would fail.
        return letters_index_to_delete

    next_pos = 0  # position in `expected` of the next letter to find
    for n, letter in enumerate(string_to_clean):
        if letter == expected[next_pos]:
            letters_index_to_delete.append(n)
            next_pos += 1
            if next_pos == len(expected):
                break
    return letters_index_to_delete


def clean_name_var(row, categorical_column, name_column, categories,
                   categorical_column2='categorical_column_cleaned'):
    '''
    Cleans a name variable by appending the letters that ended up in the
    categorical column.
    Input:
        row (df.row): The row from the df to be cleaned
        categorical_column (str): name of the column with the dirty categories (ex. type)
        name_column (str): name of the column to be cleaned
        categories (list): list with the values (str) expected in the
                           categorical column (ex. EE-Male, EE-Female)
        categorical_column2 (str): name of the column with the cleaned categories (ex. type2)
    Output:
        cleaned_name (str): the name followed by the characters of the dirty
                            cell that are not part of the cleaned category.
                            The name is returned unchanged when the dirty cell
                            is already a clean category or is not iterable
                            (e.g. NaN). When the cleaned category matches none
                            of `categories`, the whole dirty cell is appended.
    '''
    dirty_cell = row[categorical_column]
    try:
        col_name_end = list(dirty_cell)
    except TypeError:
        # Missing cell (e.g. NaN): there are no letters to recover.
        return row[name_column]

    if dirty_cell in categories:
        return row[name_column]

    letters_index_to_delete = []
    for category in categories:
        if row[categorical_column2] == category:
            letters_index_to_delete = remove_letters(category, dirty_cell)
            break

    # Keep every character whose position was not claimed by the category.
    claimed = set(letters_index_to_delete)
    leftover = ''.join(ch for n, ch in enumerate(col_name_end) if n not in claimed)

    return row[name_column] + leftover
于 2020-07-03T17:03:44.350 回答