Setup "dataspects DSKMF Standard Search Settings, Mappings, Indexings and Queries"

From SMW CindyKate - Main
Component2262247671
Jump to: navigation, search

Content


IsCarriedOutBy SearchEngineer

Index Settings

# Index settings: one primary shard with one replica, plus the custom analysis
# components (token filter, analyzers, tokenizer, normalizer) that the index
# mappings and the search query refer to by name.
{
  settings: {
    index: {
      number_of_shards: 1,
      number_of_replicas: 1
    },
    analysis: {
      filter: {
        # Shingle filter that suppresses single-word tokens; shingle sizes are
        # not set here, so Elasticsearch's defaults apply.
        my_shingle_2: {
          type: 'shingle',
          output_unigrams: false
        }
      },
      analyzer: {
        # Standard tokenizer + lowercasing.
        my_lowercase_analyzer: {
          type: 'custom',
          tokenizer: 'standard',
          filter: 'lowercase'
        },
        # N-gram tokenizer (defined below) + lowercasing.
        my_ngram_tokenizer_analyzer: {
          type: 'custom',
          tokenizer: 'my_ngram_tokenizer',
          filter: 'lowercase'
        },
        # Standard tokenizer, lowercasing, then shingles.
        # The 'standard' token filter is intentionally not listed: it is a
        # no-op and no longer exists as of Elasticsearch 7.0, where naming it
        # makes index creation fail.
        my_completion_analyzer: {
          type: 'custom',
          tokenizer: 'standard',
          filter: [
            'lowercase',
            'my_shingle_2'
          ]
        }
      },
      tokenizer: {
        # Emits n-grams of 3 to 20 characters built from letters, digits and
        # punctuation.
        # NOTE(review): max_gram - min_gram is 17; newer Elasticsearch versions
        # limit this spread through index.max_ngram_diff (default 1). Confirm
        # against the target cluster version and raise that setting if needed.
        my_ngram_tokenizer: {
          type: 'ngram',
          min_gram: 3,
          max_gram: 20,
          token_chars: [
            'letter',
            'digit',
            'punctuation'
          ]
        }
      },
      normalizer: {
        # Keyword normalizer: lowercases and ASCII-folds, no char filters.
        my_normalizer: {
          type: 'custom',
          char_filter: [],
          filter: ['lowercase', 'asciifolding']
        }
      }
    }
  }
}

Index Mappings

# Index mappings for the `_doc` mapping type. `_source` is kept enabled.
# Fields are grouped by level: resource silo, resource, entity/subject, and
# the nested entity annotations. The analyzers and the normalizer named here
# are the ones declared in the index settings.
{
  _doc: {
    _source: { enabled: true },
    properties: {
      # Resource silo level
      OriginatedFromResourceSiloID: { type: 'keyword', index: false, store: false },
      OriginatedFromResourceSiloLabel: { type: 'keyword', index: true, store: false },

      # Resource level
      HasResourceName: { type: 'keyword', index: true, store: false },
      HasResourceURL: { type: 'keyword', index: true, store: false },
      HasResourceType: { type: 'keyword', index: true, store: false },

      # Entity/subject level
      HasEntityClass: { type: 'keyword', index: false, store: false },
      # Stored keyword with an additional n-gram analyzed `text` sub-field.
      HasEntityType: {
        type: 'keyword',
        index: true,
        store: true,
        fields: {
          text: { type: 'text', index: true, store: false, analyzer: 'my_ngram_tokenizer_analyzer' }
        }
      },
      HasEntityName: { type: 'text', store: false },
      HasEntityTitle: {
        type: 'text',
        store: false,
        analyzer: 'my_ngram_tokenizer_analyzer',
        term_vector: 'with_positions_offsets_payloads'
      },
      # The fields below that declare copy_to also feed the `completion` field.
      HasEntityTypeAndEntityTitle: {
        type: 'text',
        store: false,
        analyzer: 'my_ngram_tokenizer_analyzer',
        term_vector: 'with_positions_offsets_payloads',
        copy_to: ['completion']
      },
      HasEntityKeywords: {
        type: 'text',
        index: true,
        analyzer: 'my_ngram_tokenizer_analyzer',
        term_vector: 'with_positions_offsets_payloads',
        copy_to: ['completion']
      },
      # Copy target of the copy_to fields; analyzed with the completion
      # analyzer and with fielddata switched on.
      completion: { type: 'text', fielddata: true, analyzer: 'my_completion_analyzer' },
      HasEntityBlurb: {
        type: 'text',
        store: false,
        analyzer: 'my_ngram_tokenizer_analyzer',
        term_vector: 'with_positions_offsets_payloads',
        copy_to: ['completion']
      },
      HasEntityContent: {
        type: 'text',
        store: false,
        analyzer: 'my_ngram_tokenizer_analyzer',
        term_vector: 'with_positions_offsets_payloads',
        copy_to: ['completion']
      },

      # Entity properties: nested annotation objects, also included in the
      # root document (include_in_root).
      HasEntityAnnotations: {
        type: 'nested',
        include_in_root: true,
        properties: {
          HasAnnotationSubject: { type: 'keyword', index: false, store: false },
          # Stored keyword; `myNormalized` is a lowercase-analyzed text variant.
          HasAnnotationPredicate: {
            type: 'keyword',
            index: true,
            store: true,
            fields: {
              myNormalized: { type: 'text', index: true, store: false, analyzer: 'my_lowercase_analyzer' }
            }
          },
          HasAnnotationPredicateURL: { type: 'text', index: true, store: false },
          HasAnnotationObjectURL: { type: 'keyword', index: false, store: false },
          HasAnnotationObjectHTMLATag: { type: 'keyword', index: false, store: false },
          # N-gram analyzed value; its content is also copied to the sibling
          # not-analyzed keyword field, and `myNormalized` is a keyword
          # variant run through my_normalizer.
          HasAnnotationObjectValue: {
            type: 'text',
            index: true,
            store: true,
            analyzer: 'my_ngram_tokenizer_analyzer',
            term_vector: 'with_positions_offsets_payloads',
            copy_to: ['HasEntityAnnotations.HasAnnotationObjectNotAnalyzedValue'],
            fields: {
              myNormalized: { type: 'keyword', index: true, store: false, normalizer: 'my_normalizer' }
            }
          },
          HasAnnotationObjectNotAnalyzedValue: { type: 'keyword', index: true, store: true }
        }
      }
    }
  }
}

Query Design

# Search request body built around `sQueryTerm` (supplied by the surrounding
# code). Returns up to 50 hits. A document matches if any of the `should`
# clauses matches; the title/type and keyword clauses carry the highest boost,
# followed by the blurb, then the content. Annotation predicates and object
# values are searched through a nested query.
{
  size: 50,
  query: {
    bool: {
      should: [
        # The top-level fields are searched with the lowercase analyzer rather
        # than the n-gram analyzer they are indexed with.
        {
          match: {
            HasEntityTypeAndEntityTitle: {
              query: sQueryTerm,
              analyzer: 'my_lowercase_analyzer',
              boost: 10
            }
          }
        },
        {
          match: {
            HasEntityBlurb: {
              query: sQueryTerm,
              analyzer: 'my_lowercase_analyzer',
              boost: 5
            }
          }
        },
        {
          match: {
            HasEntityKeywords: {
              query: sQueryTerm,
              analyzer: 'my_lowercase_analyzer',
              boost: 10
            }
          }
        },
        {
          match: {
            HasEntityContent: {
              query: sQueryTerm,
              analyzer: 'my_lowercase_analyzer'
            }
          }
        },
        {
          nested: {
            path: 'HasEntityAnnotations',
            query: {
              bool: {
                should: [
                  {
                    wildcard: {
                      'HasEntityAnnotations.HasAnnotationPredicate.myNormalized': {
                        # The sub-field is indexed through a lowercasing
                        # analyzer, but a wildcard pattern is a term-level
                        # query and is not analyzed, so the pattern is
                        # lowercased here to keep matching case-insensitive.
                        value: "*#{sQueryTerm.to_s.downcase}*",
                        boost: 10
                      }
                    }
                  },
                  {
                    match: {
                      'HasEntityAnnotations.HasAnnotationObjectValue.myNormalized': {
                        query: sQueryTerm
                      }
                    }
                  }
                ]
              }
            },
            # Return the matching nested annotations with the predicate
            # highlighted.
            inner_hits: {
              highlight: {
                fields: {
                  'HasEntityAnnotations.HasAnnotationPredicate.myNormalized': {
                    number_of_fragments: 1,
                    no_match_size: 1000
                  }
                },
                pre_tags: ['<span class="highlight">'],
                post_tags: ['</span>']
              }
            }
          }
        }
      ]
    }
  },
  highlight: {
    fields: {
      # TODO: Highlighting for
      # HasEntityAnnotations.HasAnnotationObjects.subjectPropertyEnglishValue and
      # HasEntityAnnotations.HasAnnotationObjects.subjectPropertyGermanValue
      # still needs to be implemented in ElasticsearchHit.
      HasEntityTypeAndEntityTitle: {
        fragment_size: 0,
        number_of_fragments: 1,
        no_match_size: 1000
      },
      HasEntityKeywords: {
        fragment_size: 0,
        number_of_fragments: 1,
        no_match_size: 200
      },
      HasEntityBlurb: {
        fragment_size: 2000,
        number_of_fragments: 1,
        no_match_size: 2000
      },
      HasEntityContent: {
        fragment_size: 1000,
        number_of_fragments: 1,
        no_match_size: 500
      }
    },
    # Matches are wrapped in a span the front end can style.
    pre_tags: ['<span class="highlight">'],
    post_tags: ['</span>']
  }
}