hutch
2008-11-03 02:22:53 UTC
In a posting on March 19, Rob Chambers responded to a couple of people
asking for help on training a profile using speech files rather than
the training UI. He pointed at ISpRecognizer2.SetTrainingState and
ISpTranscript.GetTranscript. That suggestion and the SAPI 5.3
documentation provide a starting point, but they leave a lot of
questions about how to actually implement a solution, and I’ve been
banging (hacking?) away at this problem for a couple of weeks.
I’ve included code below that seems like it should do the trick, but
it doesn’t. The SAPI 5.3 documentation says that
ISpRecognizer2.SetTrainingState(newState,retainTraining) puts the
system into the same state as the training UI. I call
SetTrainingState(TRUE,TRUE), bind an ISpStream to an audio file that
either has a transcript in the file or I add the correct text using
AppendTranscript, then set the recognizer’s input to that stream and
activate recognition. The recognizer performs recognition and makes
mistakes, and according to the documentation it should be adapting the
acoustic and language models. But if I send the same file
again immediately, it makes exactly the same mistakes. If after the
first recognition I send SetTrainingState(FALSE,TRUE) to stop training
and save the adaptation, then test it again, it still makes the same
mistakes! So there must be something more.
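In case it helps to see the sequence without the error handling, here
is a condensed sketch of just the steps I described above (the
wave-file path, the helper name, and the transcript text are
placeholders, and the event wait is omitted; the full program is
below):

#include <atlbase.h>
#include "sphelper.h"
#include <sapi.h>

// Sketch only: put the engine into training mode, feed it one wave file
// with a known transcript, then leave training mode keeping the adaptation.
HRESULT TrainFromWavSketch(ISpRecognizer2 * pReco2, ISpRecognizer * pReco,
                           ISpRecoGrammar * pGrammar,
                           LPCWSTR pszWavFile, LPCWSTR pszTranscript)
{
    CComPtr<ISpStream> cpStream;
    CComPtr<ISpTranscript> cpTranscript;

    // Same state the training UI is supposed to put the engine in
    HRESULT hr = pReco2->SetTrainingState(TRUE, TRUE);

    // Bind a stream to the wave file and attach the known-correct text
    if (SUCCEEDED(hr))
        hr = cpStream.CoCreateInstance(CLSID_SpStream);
    if (SUCCEEDED(hr))
        hr = cpStream->BindToFile(pszWavFile, SPFM_OPEN_READONLY,
                                  NULL, NULL, SPFEI_ALL_EVENTS);
    if (SUCCEEDED(hr))
        hr = cpStream.QueryInterface(&cpTranscript);
    if (SUCCEEDED(hr))
        hr = cpTranscript->AppendTranscript(pszTranscript);

    // Make the stream the recognizer's input and run dictation over it
    if (SUCCEEDED(hr))
        hr = pReco->SetInput(cpStream, TRUE);
    if (SUCCEEDED(hr))
        hr = pGrammar->SetDictationState(SPRS_ACTIVE);
    // ...wait here for SPEI_RECOGNITION (see ReturnResult below)...
    if (SUCCEEDED(hr))
        hr = pGrammar->SetDictationState(SPRS_INACTIVE);

    // Leave training mode, asking the engine to keep what it learned
    if (SUCCEEDED(hr))
        hr = pReco2->SetTrainingState(FALSE, TRUE);
    return hr;
}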
Another strategy is to use ISpRecoResult2.CommitText, which the SAPI
5.3 documentation says will train the acoustic and/or language model
to adapt to the correct text that is given as an argument to that
method. That has the effect of creating 3 new profile files in the
MSASR folder, with the same .dld, _1.ngr, and .tbp suffixes as the
previous files but with the index incremented (e.g. 1033/12/L1033.dld
instead of 1033/11/L1033.dld). The 2 audio model files are unchanged.
But if I send the same audio file again, it still makes exactly the
same mistakes!
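For reference, stripped of the surrounding plumbing, the CommitText
path amounts to the snippet below (the wrapper name is just for
illustration; cpResult is the recognition result delivered with
SPEI_RECOGNITION and sCorrectText is what was actually said):

#include <atlbase.h>
#include "sphelper.h"
#include <sapi.h>

// Sketch only: ask the engine to adapt its models to the known-correct
// text for the whole recognized phrase.
HRESULT CommitCorrectionSketch(CComPtr<ISpRecoResult> & cpResult,
                               LPCWSTR sCorrectText)
{
    CComPtr<ISpRecoResult2> cpResult2;
    HRESULT hr = cpResult.QueryInterface(&cpResult2);
    if (SUCCEEDED(hr))
        hr = cpResult2->CommitText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE,
                                   sCorrectText, SPCF_DEFINITE_CORRECTION);
    return hr;
}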
Whether I try these two approaches separately or together, they still
do not work.
Rob (or someone), please help! I have done a lot of work and am
sharing the code, so I hope I will get some help about how to make
this work—it’s urgent. I’m hoping it’s a small detail. (P.S.: what is
the index /12/ in the 1033/12/L1033 files?)
Bill Hutchison
#include "stdafx.h"
#include "sphelper.h"
#include <sapi.h>
#include <string.h>
//MAIN() is last function below
//Wait for notifications and return the first SPEI_RECOGNITION result
inline HRESULT ReturnResult(ISpRecoContext * pRecoCtxt, ISpRecoResult ** ppResult)
{
HRESULT hr = S_OK;
CSpEvent spEvent;
while (S_OK == pRecoCtxt->WaitForNotifyEvent(INFINITE))
{
while (S_OK == spEvent.GetFrom(pRecoCtxt))
{
switch (spEvent.eEventId)
{
case SPEI_RECOGNITION:
*ppResult = spEvent.RecoResult();
if (*ppResult)
{
(*ppResult)->AddRef();
}
return hr;
default: //hypotheses, false recognitions, and other events of interest
spEvent.Clear();
break;
}
}
}
return hr;
}
inline HRESULT TrainOneFile(ISpRecoContext * cpRecoCtxt, ISpRecognizer
* cpRecognizerBase, ISpRecoGrammar * cpGrammar)
{
CComPtr<ISpStream> cpStream;
CComPtr<ISpRecoResult> cpResult;
CComPtr<ISpTranscript> cpTranscript;
PWCHAR pwszTranscript;
HRESULT hr = S_OK;
hr = cpStream.CoCreateInstance(CLSID_SpStream);
// Bind a stream to an existing wavefile
if (SUCCEEDED(hr)) {
hr = cpStream->BindToFile(L"C:\\XX.wav", SPFM_OPEN_READONLY,
NULL,
NULL,
SPFEI_ALL_EVENTS);
}
if (SUCCEEDED(hr)){
hr = cpStream.QueryInterface(&cpTranscript);
}
if (SUCCEEDED(hr)) {
hr = cpTranscript->GetTranscript(&pwszTranscript);
}
//THIS IS ALTERNATE CODE FOR THE PREVIOUS CALL, FOR SOUND FILES THAT
//DON'T HAVE A TRANSCRIPT ATTACHED
LPCWSTR sCorrectText = L"Anyone who has spent time on a farm knows "
L"there is a rhythm to the year.";
if (SUCCEEDED(hr)){
hr = cpTranscript->AppendTranscript(sCorrectText);
}
if (SUCCEEDED(hr)) {
hr = cpTranscript->GetTranscript(&pwszTranscript);
}
if(SUCCEEDED(hr)){
hr = cpRecognizerBase->SetInput(cpStream, TRUE);
}
USES_CONVERSION;
CSpDynamicString dstrText;
if (SUCCEEDED (hr)){
hr = cpGrammar->SetDictationState(SPRS_ACTIVE);
}
if (SUCCEEDED(hr)){
hr = ReturnResult(cpRecoCtxt, &cpResult);
}
if (SUCCEEDED(hr)){
hr = cpGrammar->SetDictationState( SPRS_INACTIVE );
}
if ((cpResult) && (SUCCEEDED(hr))){
hr = cpResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &dstrText, NULL);
}
CComPtr<ISpRecoResult2> cpResult2;
if (SUCCEEDED(hr)){
hr = cpResult.QueryInterface<ISpRecoResult2>(&cpResult2);
}
if (SUCCEEDED(hr)){
//COMMITTEXT SHOULD FORCE ADAPTATION OF MODELS TO CORRECT TEXT
//(THOUGH IT SHOULD BE REDUNDANT WITH SETTRAININGSTATE()?)
hr = cpResult2->CommitText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, sCorrectText, SPCF_DEFINITE_CORRECTION);
cpResult.Release();
cpResult2.Release();
}
return hr;
}
int _tmain(int argc, _TCHAR* argv[])
{
HRESULT hr = S_OK;
CComPtr<ISpRecognizer2> cpRecognizer;
CComPtr<ISpRecoContext> cpRecoCtxt;
CComPtr<ISpRecoGrammar> cpGrammar;
CComPtr<ISpRecognizer> cpRecognizerBase;
hr = ::CoInitialize(NULL);
if (SUCCEEDED(hr)) {
hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer);
}
if (SUCCEEDED(hr)){
hr = cpRecognizer.QueryInterface<ISpRecognizer>(&cpRecognizerBase);
}
if (SUCCEEDED(hr)){
hr = cpRecognizerBase->CreateRecoContext(&cpRecoCtxt);
}
if (cpRecoCtxt){
hr = cpRecoCtxt->CreateGrammar(0, &cpGrammar);
}
if (SUCCEEDED(hr)){
hr = cpGrammar->LoadDictation(NULL, SPLO_STATIC);
}
if (SUCCEEDED(hr)){
hr = cpRecognizer->SetTrainingState(TRUE, TRUE);
}
if (SUCCEEDED(hr)){
hr = cpRecoCtxt->SetNotifyWin32Event();
}
if (SUCCEEDED(hr)){
hr = cpRecoCtxt->SetInterest(
SPFEI(SPEI_RECOGNITION)|
SPFEI(SPEI_HYPOTHESIS)|
SPFEI(SPEI_FALSE_RECOGNITION),
SPFEI(SPEI_RECOGNITION)|
SPFEI(SPEI_HYPOTHESIS)|
SPFEI(SPEI_FALSE_RECOGNITION));
}
if (SUCCEEDED(hr)){
hr = TrainOneFile(cpRecoCtxt, cpRecognizerBase, cpGrammar);
}
if (SUCCEEDED(hr)){//RERUN TO CHECK FOR IMPROVEMENT
hr = TrainOneFile(cpRecoCtxt, cpRecognizerBase, cpGrammar);
}
if (cpRecognizer){
cpRecognizer->SetTrainingState(FALSE, TRUE);//should turn off and save changes
}
::CoUninitialize();
return 0;
}