VBA使用 COM API 使用 OneNote 2013/2016 的图像识别功能

最近由大量的扫描单据须要摘录,就但愿可以经过VBA程序辅助完成这项工做。通过一番检索,在能获取到的主要的OCR产品中,微软产品的识别率相对较高。但目前经常使用的Office 2013和Office 2016  Microsoft OFFICE 2013之后,Microsoft Office Document Imaging就不在支持了,网上可以下载到繁体中文的ODI,但在Windows 10下没法安装。只能在OneNote的图像识别功能了。html

根据网上的文章作了基于VBA的OCR,在编写XML的过程当中颇费了一些周折。根据错误代码判断错误的问题点仍是颇有帮助的。node

https://msdn.microsoft.com/zh-cn/magazine/ff796230.aspxapp

http://www.cnblogs.com/BenAndWang/p/5826634.htmlide

https://msdn.microsoft.com/zh-cn/library/jj680117函数

如下为代码部分:编码

Function GetTextFromSinglePicture(inPicPath As String) As String

    '图片的信息编码,并输出到xml文本中

    Dim xmlDoc As New MSXML2.DOMDocument60
    Dim xmlNode As MSXML2.IXMLDOMNode
    Dim xmlEle As MSXML2.IXMLDOMElement

    Dim picBase64 As imageBase64

    '建立临时的笔记本
    Dim onenoteFullName As String
    With New Scripting.FileSystemObject
        onenoteFullName = .GetSpecialFolder(TemporaryFolder) & "\" & .GetBaseName(.GetTempName) & ".one"
        '判断函数值是否正常
        If .FileExists(inPicPath) = False Then
            GetTextFromPicture = "! Error File Path !"
            Exit Function
        End If
    End With

    Dim onenoteApp As New OneNote.Application
    If onenoteApp Is Nothing Then
        GetTextFromPicture = "! Error in Openning OneNote !"
        GoTo clear_variable_before_exit
    End If
    
    Dim sectionID As String
    Dim pageID As String
    
    Set xmlEle = CreateNotePageContentElement(2, inPicPath)
    
    Set xmlEle = AddNodeInfo(xmlEle)
    
    '建立临时的笔记本,获取sectionID
    onenoteApp.OpenHierarchy onenoteFullName, "", sectionID, cftSection
    
    '建立新的页面,获取pageID
    onenoteApp.CreateNewPage sectionID, pageID, npsBlankPageNoTitle

    '获取页面的XML格式
    Dim pageXmlText As String
    onenoteApp.GetPageContent pageID, pageXmlText, , xs2013
    
    '导入到XML中进行处理,将图片形式导入到XML中
    If xmlDoc.LoadXML(pageXmlText) = False Then
        GetTextFromPicture = "! Error in Loading Xml !"
        GoTo clear_variable_before_exit
    End If
    With xmlDoc.getElementsByTagName("one:Page").Item(0)
        .appendChild xmlEle
    End With
    
    '更新Page内容
    onenoteApp.UpdatePageContent xmlDoc.DocumentElement.xml, , xs2013

    'OneNote识别图片须要时间,如下开始轮询结果,1秒*10次
    Sleep 1000

    Dim iCNT As Integer
    iCNT = 10
re_getPageContent:
    onenoteApp.GetPageContent pageID, pageXmlText, , xs2013
    xmlDoc.LoadXML pageXmlText
    Set xmlEle = xmlDoc.DocumentElement.getElementsByTagName("one:OCRText").Item(0)
    If xmlEle Is Nothing Then
        If iCNT > 0 Then
            Sleep 1000
            iCNT = iCNT - 1
            GoTo re_getPageContent
        Else
            GetTextFromPicture = "! Waiting OneNote Time Expired !"
        End If
    Else
        GetTextFromPicture = xmlEle.Text
    End If
        
clear_variable_before_exit:
    If Not onenoteApp Is Nothing Then
        If Len(pageID) > 0 Then
            onenoteApp.DeleteHierarchy pageID, , True
        End If
        Set onenoteApp = Nothing
    End If
    Kill onenoteFullName
End Function
OneNote识别的VBA主要函数

其中定义了图片Base64类型:spa

Type imageBase64
    base64Text As String
    imageWidth As Long
    imageHeight As Long
End Type

引用了API函数,轮询的时候不会致使程序无响应code

Private Declare PtrSafe Sub Sleep Lib "kernel32" (ByVal dwMilliseconds As Long)

 

Function CreateNotePageContentElement(contentType As Integer, paraContent As String) As MSXML2.IXMLDOMElement
    Dim xmlEle As MSXML2.IXMLDOMElement
    Dim xmlNode As MSXML2.IXMLDOMElement
                
    Dim ns As String
    ns = "one:"
    
    With New MSXML2.DOMDocument60
        Select Case contentType
            Case 1 '文本
                Set xmlNode = .createElement(ns & "T")
                xmlNode.Text = paraContent
            Case 2 '图片
                Dim picBase64 As imageBase64
                picBase64 = getBase64(paraContent)
                
                '建立一个图片XML信息
                
                Set xmlNode = .createElement(ns & "Image")
                xmlNode.setAttribute "format", "jpg"
                xmlNode.setAttribute "originalPageNumber", 0
                
                Set xmlEle = .createElement(ns & "Position")
                xmlEle.setAttribute "x", 0
                xmlEle.setAttribute "y", 0
                xmlEle.setAttribute "z", 0
                xmlNode.appendChild xmlEle
                
                Set xmlEle = .createElement(ns & "Size")
                xmlEle.setAttribute "width", picBase64.imageWidth
                xmlEle.setAttribute "height", picBase64.imageHeight
                xmlNode.appendChild xmlEle
                
                Set xmlEle = .createElement(ns & "Data")
                xmlEle.Text = picBase64.base64Text
                xmlNode.appendChild xmlEle
        End Select
    End With
    Set CreateNotePageContentElement = xmlNode
End Function


Function AddNodeInfo(ContentElement As MSXML2.IXMLDOMElement) As MSXML2.IXMLDOMElement
    Dim xmlEle As MSXML2.IXMLDOMElement
    Dim xmlNode As MSXML2.IXMLDOMElement
    Dim ns As String
    ns = "one:"
    Set xmlNode = ContentElement
    With New MSXML2.DOMDocument60
        Set xmlEle = .createElement(ns & "OE")
        xmlEle.appendChild xmlNode
        Set xmlNode = xmlEle
        
        Set xmlEle = .createElement(ns & "OEChildren")
        xmlEle.appendChild xmlNode
        Set xmlNode = xmlEle
        
        Set xmlEle = .createElement(ns & "Outline")
        xmlEle.appendChild xmlNode
        Set xmlNode = xmlEle
    End With

    Set AddNodeInfo = xmlNode

End Function
XML处理的函数

 

Function getBase64(inBmpFile As String) As imageBase64
    
    Dim xmlEle As MSXML2.IXMLDOMElement
    With New MSXML2.DOMDocument60
        Set xmlEle = .createElement("Base64Data")
    End With
    xmlEle.DataType = "bin.base64"
    
    With New ADODB.Stream
        .Type = adTypeBinary
        .Open
        .LoadFromFile inBmpFile
        xmlEle.nodeTypedValue = .Read()
        .Close
    End With
    
    getBase64.base64Text = xmlEle.Text
    
    With CreateObject("WIA.ImageFile")
        .loadfile inBmpFile
        getBase64.imageHeight = .Height
        getBase64.imageWidth = .Width
    End With

End Function
图片处理为Base64编码的函数

 

造成VBA模块之后,OCR_Pictures_To_Text函数能够直接在单元格引用,也能够在主程序中引用orm

Sub OCR_Pictures_To_Text()
    Dim vFNi As Variant
    Dim sFNi As Variant
    
    Dim sFNo As String
    
    Dim oTS As TextStream
    
    vFNi = Application.GetOpenFilename("*.jpg,*.jpg", , , , True)
    
    If VarType(vFNi) = vbBoolean Then Exit Sub
    
    sFNo = Application.GetSaveAsFilename(, "*.txt,*.txt")
    
    If sFNo = "False" Then Exit Sub
    
    Dim sTmp As String
    
    With New Scripting.FileSystemObject
        Set oTS = .CreateTextFile(sFNo)
    End With
    For Each sFNi In vFNi
        sTmp = GetTextFromPicture(CStr(sFNi))
        While InStr(1, sTmp, " ") > 0
            sTmp = Replace(sTmp, " ", "")
        Wend
        oTS.Write sTmp
    Next
    oTS.Close
    MsgBox "OK"
End Sub
OCR主程序
相关文章
相关标签/搜索