Making a self-hosted link summariser using LLMs

If you use Reddit or Lemmy you're probably familiar with the bots that summarise news links and post the result as a comment. I decided to make one of these myself, using an LLM to do the summarisation.

Step 1: Get the website’s text

For this you want to fetch the page's HTML and remove anything that won't help the bot, such as JavaScript, CSS, images or videos. You'll also want to get rid of excessive spaces and newlines, as they'd take up tokens that could be used for more words.


// standardizeSpaces collapses runs of whitespace (spaces, tabs, newlines)
// into single spaces
func standardizeSpaces(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

func scrapeSite(url string) (string, error) {
	resp, err := http.Get(url)
	if err != nil {
		slog.Error("Failed to get site", "site", url, "error", err)
		return "", err
	}
	defer resp.Body.Close()

	doc, err := goquery.NewDocumentFromReader(resp.Body)

	if err != nil {
		slog.Error("Failed to get site body", "site", url, "error", err)
		return "", err
	}

	doc.Find("img").Each(func(i int, el *goquery.Selection) {
		el.Remove()
	})
	// We don't care about images, we only want the text on the site
	doc.Find("picture").Each(func(i int, el *goquery.Selection) {
		el.Remove()
	})
	// We don't care about scripts, we only want the text on the site
	doc.Find("script").Each(func(i int, el *goquery.Selection) {
		el.Remove()
	})
	// We don't care about css, we only want the text on the site
	doc.Find("style").Each(func(i int, el *goquery.Selection) {
		el.Remove()
	})
	// We don't care about videos, we only want the text on the site
	doc.Find("video").Each(func(i int, el *goquery.Selection) {
		el.Remove()
	})

	// There's unlikely to be anything we want in these
	doc.Find("noscript").Each(func(i int, el *goquery.Selection) {
		el.Remove()
	})

	// Remove excess spaces and newlines
	bdyText := standardizeSpaces(doc.Text())
	slog.Info("Scraped site text", "site", url, "text", bdyText)
	return bdyText, nil
}
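
To sanity-check the scraper on its own before involving the LLM, a throwaway main like the sketch below works; the URL is just an example, and the fmt and os imports are assumed alongside the ones the functions above already need.


func main() {
	// Scrape an example page and print whatever text survives the cleanup
	text, err := scrapeSite("https://example.com")
	if err != nil {
		// scrapeSite has already logged the details
		os.Exit(1)
	}
	fmt.Println(text)
}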

Step 2: Use the LLM to summarise the text

For this you’ll need to truncate the content from the site if it’s too long, then send it to the LLM along with a system prompt telling it to act as a summariser.


func doSummarisation(token string, maxTokens int, server string, model string, siteText string) (summary string, err error) {

	// Rule of thumb: cap the number of characters we send at a small
	// multiple of the model's token limit, since a token is usually a few
	// characters long
	lengthOfDataToSend := len(siteText)
	likelyMaximumLengthAccepted := 4 * maxTokens
	if lengthOfDataToSend > likelyMaximumLengthAccepted {
		lengthOfDataToSend = likelyMaximumLengthAccepted
	}

	aiClient := openai.DefaultConfig(token)
	aiClient.BaseURL = server
	client := openai.NewClientWithConfig(aiClient)

	stream, err := client.CreateChatCompletionStream(
		context.Background(),
		openai.ChatCompletionRequest{
			Model: model,
			Messages: []openai.ChatCompletionMessage{
				{
					Role:    openai.ChatMessageRoleSystem,
					Content: "You are an expert at summarising text in a concise manner for intelligent social media users.",
				},
				{
					Role: openai.ChatMessageRoleUser,
					Content: fmt.Sprintf("Summarise the following text using the fewest possible words:\n%s",
						siteText[:lengthOfDataToSend]),
				},
			},
			Stream:           true,
			Stop:             []string{"<|end_of_text|>", "<|eot_id|>"},
			Temperature:      0.7,
			TopP:             0.95,
			MaxTokens:        maxTokens,
			FrequencyPenalty: 0.0,
			PresencePenalty:  0.0,
		},
	)

	if err != nil {
		slog.Error("Error with ChatCompletionStream for summariser", "error", err)
		return "", err
	}
	defer stream.Close()

	var summaryBuilder strings.Builder
	for {
		var response openai.ChatCompletionStreamResponse
		response, err = stream.Recv()
		if errors.Is(err, io.EOF) {
			slog.Debug("Finished receiving response from LLM")
			return summaryBuilder.String(), nil
		}

		if err != nil {
			slog.Error("Error with ChatCompletionStream for summariser", "error", err)
			return summaryBuilder.String(), err
		}
		// strings.Builder never returns an error from WriteString, and some
		// stream chunks can arrive without any choices, so just guard the index
		if len(response.Choices) > 0 {
			summaryBuilder.WriteString(response.Choices[0].Delta.Content)
		}

	}
	// Never get here

}
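
To tie the two steps together before handing the result to whatever posts the comment, a small helper along these lines is one option. The function name and parameters here are my own illustration rather than something from the full repo; token, server, model and maxTokens come from however you configure the bot (flags, environment variables, etc).


// summariseLink scrapes a page and asks the LLM for a short summary of it
func summariseLink(url, token, server, model string, maxTokens int) (string, error) {
	siteText, err := scrapeSite(url)
	if err != nil {
		return "", err
	}
	return doSummarisation(token, maxTokens, server, model, siteText)
}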

The code

The full version of this code is available at https://github.com/RichardoC/personal-webpage-summariser

Better versions exist, such as https://github.com/RikudouSage/LemmyAutoTldrBot


Copyright © 2024 Richard Finlay Tweed. All rights reserved. All views expressed are my own