我刚开始接触 Google Apps Script。我在 Google Drive 的一个文件夹里有一些 PDF 文件,想把这些 PDF 转换成 Google 文档,并从中提取特定的文本。PDF 有 200 多页,但转换得到的 Google 文档却被限制在 80 页以内。OCR 能处理的页数是否有限制?还是我遗漏了什么?
我的代码如下:
//#### GLOBALS ####
// ID of the Drive folder that holds all the PDFs to process.
const FOLDER_ID = "1rlAL4WrnxQ6pEY2uOmzWA_csUIDdBjVK";
// ID of the spreadsheet that receives the extracted values.
const SS = "1XS_YUUdu9FK_bBumK3lFu9fU_M9w7NGydZqOzu9vTyE";
// Name of the sheet tab that receives the extracted values.
// NOTE(review): this declaration had been fused into the comment on the SS
// line, leaving SHEET undefined; the tab name below was reconstructed from
// that garbled text - confirm it matches the real tab name.
const SHEET = "Extracted";
/*########################################################
 * Main entry point.
 *
 * Walks every PDF in the Drive folder FOLDER_ID, converts each one to text
 * with getTextFromPDF, collects whatever extractInvDate finds in that text,
 * and hands the combined list to importToSpreadsheet.
 *
 */
function extractInfo() {
  const folder = DriveApp.getFolderById(FOLDER_ID);
  // Only PDFs are requested; other file types in the folder are not visited.
  const files = folder.getFilesByType("application/pdf");
  let allInfo = [];

  // Iterate through each PDF in the folder.
  while (files.hasNext()) {
    Logger.log('first call');
    const file = files.next();
    const doc = getTextFromPDF(file.getId());
    // extractInvDate can come back empty-handed (undefined); concat(undefined)
    // would append a bogus entry, so fall back to an empty list.
    const invDate = extractInvDate(doc.text) || [];
    allInfo = allInfo.concat(invDate);
    Logger.log("Length of allInfo array: ");
    Logger.log(allInfo.length);
  }

  // NOTE (from the original author): allInfo holds 80 entries at this point
  // even though the PDF has more than 200 pages, each one carrying the
  // required text (invoice date).
  importToSpreadsheet(allInfo);
}
/*########################################################
 * Extracts the text from a PDF and returns it in memory.
 * Also returns the file name.
 *
 * The PDF is uploaded through the Drive advanced service as a temporary
 * Google Doc with OCR enabled; the Doc's body text is read and the temporary
 * Doc is removed again, even when reading it fails.
 *
 * NOTE(review): the author reports that for a 200+ page PDF the converted
 * Doc only covers about 80 pages, so the returned text may be truncated for
 * long files - confirm against the Drive OCR limits.
 *
 * param {string} : fileID : file ID of the PDF that the text will be extracted from.
 *
 * returns {object} : { name, text } - the temporary Doc's name and its body text.
 *
 */
function getTextFromPDF(fileID) {
  const blob = DriveApp.getFileById(fileID).getBlob();
  const resource = {
    title: blob.getName(),
    mimeType: blob.getContentType()
  };
  const options = {
    ocr: true,
    ocrLanguage: "en"
  };

  // Convert the pdf to a Google Doc with ocr.
  const file = Drive.Files.insert(resource, blob, options);

  try {
    // Get the text from the newly created Doc.
    const doc = DocumentApp.openById(file.id);
    const text = doc.getBody().getText();
    const title = doc.getName();
    return {
      name: title,
      text: text
    };
  } finally {
    // Always delete the temporary Doc so a failed read does not leave
    // stray converted files behind in Drive.
    Drive.Files.remove(file.id);
  }
}
/*########################################################
 * Finds every occurrence of the label "Invoice Date:" in the given text.
 *
 * param {string} : text : text to search, e.g. the OCR output of one PDF.
 *
 * returns {array} : one "Invoice Date:" string per occurrence; an empty
 *                   array when there is no match or text is not a string.
 *
 */
function extractInvDate(text) {
  if (typeof text !== "string") {
    return [];
  }
  // Only the label itself is matched; the author had commented out the date
  // part of the pattern: \d{2}\/\d{2}\/\d{4}/gi
  const regexp = /Invoice Date:/g;
  // match() returns null when nothing is found; normalise that to an empty
  // array instead of relying on a swallowed TypeError.
  return text.match(regexp) || [];
}
/*########################################################
 * Writes the given values into column A of the SHEET tab of spreadsheet SS,
 * one value per row, starting at row 3.
 *
 * param {array} : data : values to write; nothing is written when it is
 *                        empty or missing.
 *
 */
function importToSpreadsheet(data) {
  // A range needs at least one row, so there is nothing to do for no data.
  if (!data || data.length === 0) {
    return;
  }

  const sheet = SpreadsheetApp.openById(SS).getSheetByName(SHEET);
  if (!sheet) {
    throw new Error('Sheet tab "' + SHEET + '" was not found in spreadsheet ' + SS);
  }

  // setValues expects a 2D array (rows x columns); wrap each value in its own
  // single-cell row and write everything in one call instead of cell by cell.
  const rows = data.map((value) => [value == null ? "" : value]);
  sheet.getRange(3, 1, rows.length, 1).setValues(rows);
}
// Posted on 2022-04-19 13:47:10
问题(或者说限制)出在 Drive.Files.insert 函数上。
可以先获取文件的 blob,再把它读取为字符串;不过得到的字符串中还带有 MIME 相关的细节,可能需要另行处理。示例代码如下,请根据您的需要修改:
var blob = DriveApp.getFileById(fileID).getBlob()
var txt = blob.getDataAsString()
https://stackoverflow.com/questions/67822036
复制相似问题