bb58c12306
Co-authored-by: aider (gemini/gemini-2.5-pro) <aider@aider.chat>
178 lines
5.5 KiB
TypeScript
178 lines
5.5 KiB
TypeScript
import { excerptAfter, excerptBefore, type SearchMatch } from '../globals'
|
|
import { removeDiacritics, warnVerbose } from './utils'
|
|
import type { Query } from '../search/query'
|
|
import { Notice } from 'obsidian'
|
|
import { escapeRegExp } from 'lodash-es'
|
|
import type OmnisearchPlugin from '../main'
|
|
|
|
/**
 * Text helpers used to locate search terms inside a note's content,
 * highlight them, and build HTML-escaped excerpts.
 */
export class TextProcessor {
  constructor(private plugin: OmnisearchPlugin) {}

  /**
   * Wraps the matches in the text with a <span> element and a highlight class
   * @param text The text to highlight. Matches are HTML-escaped before being
   *   searched for, so `text` is presumably already escaped (e.g. the output
   *   of `makeExcerpt`) — confirm against callers.
   * @param matches The matches whose `match` strings should be highlighted
   * @returns The html string with the matches highlighted, or `text`
   *   unchanged if there is nothing to highlight or the regex cannot be built
   */
  public highlightText(text: string, matches: SearchMatch[]): string {
    // An empty term would add an empty alternative to the regex, which matches
    // at every position and would wrap the whole text in empty <span>s.
    const terms = matches.map(item => item.match).filter(term => !!term)
    if (!terms.length) {
      return text
    }

    const highlightClass = `suggestion-highlight omnisearch-highlight ${
      this.plugin.settings.highlight ? 'omnisearch-default-highlight' : ''
    }`

    try {
      const pattern = terms
        .map(term => escapeRegExp(escapeHTML(term)))
        .join('|')
      return text.replace(
        new RegExp(`(${pattern})`, 'giu'),
        `<span class="${highlightClass}">$1</span>`
      )
    } catch (e) {
      console.error('Omnisearch - Error in highlightText()', e)
      return text
    }
  }

  /**
   * Converts a list of strings to a single case-insensitive, global regex
   * matching any of them as a whole word: each term must be neither preceded
   * nor followed by a `\w` character.
   * Used to find excerpts in a note body, or select which words to highlight.
   * @param strings The terms to match. The array is not modified.
   * @returns The combined regex, or `/^$/g` when there is no non-empty term
   */
  public stringsToRegex(strings: string[]): RegExp {
    // `filter` returns a new array, so the in-place sort below does not
    // reorder the caller's array. Empty terms are dropped because they would
    // produce a zero-width alternative that matches almost everywhere.
    const terms = strings
      .filter(s => s.length > 0)
      // Longer strings first, so that they are matched before their prefixes
      .sort((a, b) => b.length - a.length)

    if (!terms.length) return /^$/g

    const joined = `(${terms
      .map(s => `(?<!\\w)${escapeRegExp(s)}(?!\\w)`)
      .join('|')})`

    return new RegExp(joined, 'gui')
  }

  /**
   * Returns an array of matches in the text, for the provided words
   * @param text The text to search in
   * @param words The words to look for (combined through `stringsToRegex`)
   * @param query Optional query; when its best excerpt string is found
   *   verbatim in the text, that occurrence is placed first in the result
   * @returns The matches found, capped both in number and in time spent
   */
  public getMatches(
    text: string,
    words: string[],
    query?: Query
  ): SearchMatch[] {
    const reg = this.stringsToRegex(words)
    const originalText = text
    if (this.plugin.settings.ignoreDiacritics) {
      text = removeDiacritics(text, this.plugin.settings.ignoreArabicDiacritics)
    }
    const startTime = Date.now()
    let match: RegExpExecArray | null = null
    let matches: SearchMatch[] = []
    let count = 0
    while ((match = reg.exec(text)) !== null) {
      // Avoid infinite loops, stop looking after 100 matches or if we're taking too much time
      if (++count >= 100 || Date.now() - startTime > 50) {
        warnVerbose('Stopped getMatches at', count, 'results')
        break
      }

      // A zero-length match does not advance `lastIndex`; step over it
      // manually, otherwise `exec` keeps returning the same position.
      if (match[0].length === 0) {
        reg.lastIndex++
        continue
      }

      const matchStartIndex = match.index
      const matchEndIndex = matchStartIndex + match[0].length

      // If `ignoreDiacritics` is on, `text` may have a different length than `originalText`,
      // making `match.index` unreliable for `originalText`.
      // We use `match[0]`, which is the matched term (but without diacritics).
      const originalMatchBeforeTrim = this.plugin.settings.ignoreDiacritics
        ? match[0]
        : originalText.substring(matchStartIndex, matchEndIndex)

      const originalMatch = originalMatchBeforeTrim.trim()

      if (originalMatch) {
        matches.push({ match: originalMatch, offset: match.index })
      }
    }

    // If the query is more than 1 token and can be found "as is" in the text, put this match first
    if (
      query &&
      (query.query.text.length > 1 || query.getExactTerms().length > 0)
    ) {
      const bestMatchStr = query.getBestStringForExcerpt()
      const best = text.toLowerCase().indexOf(bestMatchStr)
      if (best > -1) {
        // We found the full query. We make it the first result, and remove any other match that it contains.
        matches = matches.filter(
          m => m.offset < best || m.offset >= best + bestMatchStr.length
        )
        // NOTE(review): `best` is an index into the (possibly diacritics-stripped)
        // `text`, but is applied to `originalText` here; the two may be misaligned
        // when `ignoreDiacritics` is on.
        matches.unshift({
          offset: best,
          match: originalText.substring(best, best + bestMatchStr.length),
        })
      }
    }
    return matches
  }

  /**
   * Builds an HTML-escaped excerpt of `content` around `offset`.
   *
   * The excerpt spans from `excerptBefore` characters before the offset to
   * `excerptAfter` characters after it, with an ellipsis added on each side
   * that was cut. When the `renderLineReturnInExcerpts` setting is on, the
   * excerpt starts at the last line return found at or before the offset,
   * blank lines are removed and remaining line returns become `<br>`.
   *
   * @param content The full text to take the excerpt from
   * @param offset Position of the match in `content`; a negative (or missing)
   *   value yields the first `excerptAfter` characters
   * @returns The escaped excerpt, or an empty string if an error occurred
   *   (a Notice is shown and the error is logged to the console)
   */
  public makeExcerpt(content: string, offset: number): string {
    const settings = this.plugin.settings
    try {
      const pos = offset ?? -1
      const from = Math.max(0, pos - excerptBefore)
      const to = Math.min(content.length, pos + excerptAfter)
      if (pos > -1) {
        // The right-hand side is fully evaluated before the assignment, so
        // `content.length` below still refers to the untruncated content.
        content =
          (from > 0 ? '…' : '') +
          content.slice(from, to).trim() +
          (to < content.length - 1 ? '…' : '')
      } else {
        content = content.slice(0, excerptAfter)
      }
      if (settings.renderLineReturnInExcerpts) {
        const last = content.lastIndexOf('\n', pos - from)

        if (last > 0) {
          content = content.slice(last)
        }

        // Remove multiple line returns
        content = content
          .split(/(?:\r\n|\r|\n)/g)
          .filter(l => l)
          .join('\n')
      }

      content = escapeHTML(content)

      // Done after escaping so that the inserted <br> tags are not escaped
      if (settings.renderLineReturnInExcerpts) {
        content = content.trim().replaceAll('\n', '<br>')
      }

      return content
    } catch (e) {
      new Notice(
        'Omnisearch - Error while creating excerpt, see developer console'
      )
      console.error(`Omnisearch - Error while creating excerpt`)
      console.error(e)
      return ''
    }
  }
}
|
|
|
|
/** HTML entities for the characters that are unsafe in element content or attribute values. */
const HTML_ENTITIES: Record<string, string> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#039;',
}

/**
 * Escapes the characters `&`, `<`, `>`, `"` and `'` with their HTML entities,
 * so that the result can be inserted into HTML as plain text.
 *
 * The replacement is done in a single pass, so the `&` of a freshly produced
 * entity is never escaped a second time.
 *
 * @param html The raw string to escape
 * @returns The escaped string
 */
export function escapeHTML(html: string): string {
  return html.replace(/[&<>"']/g, char => HTML_ENTITIES[char] ?? char)
}
|