Copied to clipboard

Flag this post as spam?

This post will be reported to the moderators as potential spam to be looked at


  • Dan Diplo 1543 posts 6108 karma points MVP 4x c-trib
    Apr 29, 2020 @ 09:54
    Dan Diplo
    0

    How Examine Indexes HTML (Spacing issue)

    I've noticed when conducting an Examine search that words that appear in the main rich-text content weren't being matched in some instances. When I looked at the raw index I could see that Examine has stripped out HTML tags (which makes sense) but where this has happened it hasn't left any whitespace in their place. This causes adjacent words to run into each other in the index.

    To illustrate, take this original HTML content in the RTE:

    <p><b>Knowledge, Skills &amp; Abilities</b></p><ul><li><p>Excellent attention to detail, and ability to create wire-frames, user flows, mock-ups, interactive design and prototypes.</p></li><li><p>Excellent understanding of information architecture.</p></li><li></ul>
    

    When Examine indexes it, it is converted to plain-text like this:

    Knowledge, Skills &amp; AbilitiesExcellent attention to detail, and ability to create wire-frames, user flows, mock-ups, interactive design and prototypes.Excellent understanding of information architecture.
    

    You can see in the Examine index that, where tags were removed, words run together, such as "AbilitiesExcellent" or "prototypes.Excellent", and this then messes up matching if you search for, say, "prototypes".

    So is there a way of getting it to replace tags with whitespace when indexing, to preserve words?

  • Lars-Erik Aabech 349 posts 1098 karma points MVP 5x c-trib
    Apr 29, 2020 @ 18:38
    Lars-Erik Aabech
    1

    You could choose another analyzer/tokenizer for the fields in question.
    See example number two under Overriding index creation here:
    https://our.umbraco.com/documentation/Reference/Searching/Examine/indexing/

    I had the docs open and have just written a thing to index paths in parts, so it was a bit lucky that I can just gingerly paste this example. 😆

    /// <summary>
    /// Composer with no members of its own; all behaviour comes from the
    /// <c>ComponentComposer&lt;IndexComponent&gt;</c> base class.
    /// </summary>
    /// <remarks>
    /// NOTE(review): presumably the base class registers <c>IndexComponent</c> with Umbraco
    /// at startup - confirm against the Umbraco composing documentation.
    /// </remarks>
    public class IndexComposer : ComponentComposer<IndexComponent>
    {
    }
    
    /// <summary>
    /// Component that maps the "path" field of the external index to a custom "csv"
    /// value type, which is analyzed with <see cref="WhitespaceAndCommaAnalyzer"/>.
    /// </summary>
    public class IndexComponent : IComponent
    {
        private readonly IExamineManager examineManager;

        /// <summary>Creates the component.</summary>
        /// <param name="examineManager">Manager used to look up the index to configure.</param>
        public IndexComponent(IExamineManager examineManager)
        {
            this.examineManager = examineManager;
        }

        /// <summary>
        /// Registers the "csv" value type factory and points the "path" field at it.
        /// Does nothing when the external index is not registered or is not a Lucene index.
        /// </summary>
        public void Initialize()
        {
            if (!examineManager.TryGetIndex(Constants.UmbracoIndexes.ExternalIndexName, out var index))
            {
                return;
            }

            // A hard cast would throw InvalidCastException for a non-Lucene index; treat that
            // case like a missing index and leave the configuration untouched.
            if (!(index is Examine.LuceneEngine.Providers.LuceneIndex luceneIndex))
            {
                return;
            }

            var valueTypes = luceneIndex.FieldValueTypeCollection;

            index.FieldDefinitionCollection.AddOrUpdate(new FieldDefinition("path", "csv"));
            valueTypes.ValueTypeFactories.AddOrUpdate(
                "csv",
                new DelegateFieldValueTypeFactory(s => new FullTextType(s, new WhitespaceAndCommaAnalyzer())));
        }

        /// <summary>Nothing to clean up.</summary>
        public void Terminate()
        {
        }
    }
    
    /// <summary>
    /// Analyzer whose token streams are produced by <see cref="WhitespaceAndCommaTokenizer"/>.
    /// </summary>
    public class WhitespaceAndCommaAnalyzer : Analyzer
    {
        /// <summary>Returns a new tokenizer that reads from <paramref name="reader"/>.</summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
            => new WhitespaceAndCommaTokenizer(reader);

        /// <summary>
        /// Returns the tokenizer held in <c>PreviousTokenStream</c>, reset onto
        /// <paramref name="reader"/>; when none is stored yet, creates one and stores it.
        /// </summary>
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            var cached = (Tokenizer)PreviousTokenStream;
            if (cached != null)
            {
                cached.Reset(reader);
                return cached;
            }

            Tokenizer created = new WhitespaceAndCommaTokenizer(reader);
            PreviousTokenStream = created;
            return created;
        }
    }
    
    /// <summary>
    /// <c>WhitespaceTokenizer</c> variant that additionally rejects the comma as a token character.
    /// </summary>
    public class WhitespaceAndCommaTokenizer : WhitespaceTokenizer
    {
        /// <summary>Passes <paramref name="in"/> straight to the base constructor.</summary>
        public WhitespaceAndCommaTokenizer(TextReader @in)
            : base(@in)
        {
        }

        /// <summary>Passes both arguments straight to the base constructor.</summary>
        public WhitespaceAndCommaTokenizer(AttributeSource source, TextReader @in)
            : base(source, @in)
        {
        }

        /// <summary>Passes both arguments straight to the base constructor.</summary>
        public WhitespaceAndCommaTokenizer(AttributeFactory factory, TextReader @in)
            : base(factory, @in)
        {
        }

        /// <summary>
        /// Accepts <paramref name="c"/> only when the base tokenizer accepts it and it is not a comma.
        /// </summary>
        protected override bool IsTokenChar(char c)
        {
            if (!base.IsTokenChar(c))
            {
                return false;
            }

            return c != ',';
        }
    }
    
  • Lars-Erik Aabech 349 posts 1098 karma points MVP 5x c-trib
    Apr 29, 2020 @ 18:41
    Lars-Erik Aabech
    0

    You could choose another analyzer/tokenizer for the fields in question.
    See example number two under Overriding index creation here:
    https://our.umbraco.com/documentation/Reference/Searching/Examine/indexing/

    I had the docs open and have just written a thing to index paths in parts, so it was a bit lucky that I can just gingerly paste this example. 😆

    /// <summary>
    /// Composer with no members of its own; all behaviour comes from the
    /// <c>ComponentComposer&lt;IndexComponent&gt;</c> base class.
    /// </summary>
    /// <remarks>
    /// NOTE(review): presumably the base class registers <c>IndexComponent</c> with Umbraco
    /// at startup - confirm against the Umbraco composing documentation.
    /// </remarks>
    public class IndexComposer : ComponentComposer<IndexComponent>
    {
    }
    
    /// <summary>
    /// Component that maps the "path" field of the external index to a custom "csv"
    /// value type, which is analyzed with <see cref="WhitespaceAndCommaAnalyzer"/>.
    /// </summary>
    public class IndexComponent : IComponent
    {
        private readonly IExamineManager examineManager;

        /// <summary>Creates the component.</summary>
        /// <param name="examineManager">Manager used to look up the index to configure.</param>
        public IndexComponent(IExamineManager examineManager)
        {
            this.examineManager = examineManager;
        }

        /// <summary>
        /// Registers the "csv" value type factory and points the "path" field at it.
        /// Does nothing when the external index is not registered or is not a Lucene index.
        /// </summary>
        public void Initialize()
        {
            if (!examineManager.TryGetIndex(Constants.UmbracoIndexes.ExternalIndexName, out var index))
            {
                return;
            }

            // A hard cast would throw InvalidCastException for a non-Lucene index; treat that
            // case like a missing index and leave the configuration untouched.
            if (!(index is Examine.LuceneEngine.Providers.LuceneIndex luceneIndex))
            {
                return;
            }

            var valueTypes = luceneIndex.FieldValueTypeCollection;

            index.FieldDefinitionCollection.AddOrUpdate(new FieldDefinition("path", "csv"));
            valueTypes.ValueTypeFactories.AddOrUpdate(
                "csv",
                new DelegateFieldValueTypeFactory(s => new FullTextType(s, new WhitespaceAndCommaAnalyzer())));
        }

        /// <summary>Nothing to clean up.</summary>
        public void Terminate()
        {
        }
    }
    
    /// <summary>
    /// Analyzer whose token streams are produced by <see cref="WhitespaceAndCommaTokenizer"/>.
    /// </summary>
    public class WhitespaceAndCommaAnalyzer : Analyzer
    {
        /// <summary>Returns a new tokenizer that reads from <paramref name="reader"/>.</summary>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
            => new WhitespaceAndCommaTokenizer(reader);

        /// <summary>
        /// Returns the tokenizer held in <c>PreviousTokenStream</c>, reset onto
        /// <paramref name="reader"/>; when none is stored yet, creates one and stores it.
        /// </summary>
        public override TokenStream ReusableTokenStream(string fieldName, TextReader reader)
        {
            var cached = (Tokenizer)PreviousTokenStream;
            if (cached != null)
            {
                cached.Reset(reader);
                return cached;
            }

            Tokenizer created = new WhitespaceAndCommaTokenizer(reader);
            PreviousTokenStream = created;
            return created;
        }
    }
    
    /// <summary>
    /// <c>WhitespaceTokenizer</c> variant that additionally rejects the comma as a token character.
    /// </summary>
    public class WhitespaceAndCommaTokenizer : WhitespaceTokenizer
    {
        /// <summary>Passes <paramref name="in"/> straight to the base constructor.</summary>
        public WhitespaceAndCommaTokenizer(TextReader @in)
            : base(@in)
        {
        }

        /// <summary>Passes both arguments straight to the base constructor.</summary>
        public WhitespaceAndCommaTokenizer(AttributeSource source, TextReader @in)
            : base(source, @in)
        {
        }

        /// <summary>Passes both arguments straight to the base constructor.</summary>
        public WhitespaceAndCommaTokenizer(AttributeFactory factory, TextReader @in)
            : base(factory, @in)
        {
        }

        /// <summary>
        /// Accepts <paramref name="c"/> only when the base tokenizer accepts it and it is not a comma.
        /// </summary>
        protected override bool IsTokenChar(char c)
        {
            if (!base.IsTokenChar(c))
            {
                return false;
            }

            return c != ',';
        }
    }
    
Please Sign in or register to post replies

Write your reply to:

Draft