A Data Extraction Workflow File drives DataScraper's workflow engine: it records, in sequence, the workflow processors required to extract data from the Web pages belonging to a specific theme. The file is in XML format, and its name ends with the suffix .profile.xml. The following is an example:
<?xml version="1.0"?>
<!-- Example workflow profile: one theme plus the list of workflow processors. -->
<geometa-session-profile>
  <!-- Name of the theme this profile applies to. -->
  <theme>theme-name</theme>

  <!--
    Workflow processors, recorded in sequence. Element order is the only
    ordering information in this file, so keep the entries in the intended
    order. Each entry's id equals the simple name of its class.
  -->
  <processor id="MigrateWorksBucket">
    <class>com.geometa.spider.processor.MigrateWorksBucket</class>
  </processor>
  <processor id="FetchSpiderClue">
    <class>com.geometa.spider.processor.FetchSpiderClue</class>
  </processor>
  <processor id="LoadHtmlPage">
    <class>com.geometa.spider.processor.LoadHtmlPage</class>
  </processor>
  <processor id="FindDataSchema_Plain">
    <class>com.geometa.spider.processor.FindDataSchema_Plain</class>
  </processor>
  <processor id="ExtractWebNodeData_Simp">
    <class>com.geometa.spider.processor.ExtractWebNodeData_Simp</class>
  </processor>
  <processor id="ValidateExtraction">
    <class>com.geometa.spider.processor.ValidateExtraction</class>
  </processor>
  <processor id="SaveFile_Simp">
    <class>com.geometa.spider.processor.SaveFile_Simp</class>
  </processor>
  <processor id="ExtractSpiderClue_Simp">
    <class>com.geometa.spider.processor.ExtractSpiderClue_Simp</class>
  </processor>
  <processor id="ConfirmSpiderClue_Simp">
    <class>com.geometa.spider.processor.ConfirmSpiderClue_Simp</class>
  </processor>
  <processor id="CleanWorksBucket">
    <class>com.geometa.spider.processor.CleanWorksBucket</class>
  </processor>
</geometa-session-profile>