LLM Code Documentation

This script automates the generation of AI-powered documentation for Docker services. Key features include:

  • Uses SearxNG API to gather context about services from multiple sources
  • Scrapes project websites and GitHub READMEs for additional context
  • Processes content through Ollama LLM models (Qwen 2.5 and Llama 3.1)
  • Generates structured markdown documentation with sections for:
    • Introduction
    • Uses and Benefits
    • Docker Setup
    • Security Essentials
  • Handles frontmatter management including metadata, tags, and performance metrics
  • Attempts, but fails miserably, to maintain consistent formatting and style across all generated content
import fs from 'fs';
import matter from 'gray-matter';
import axios from 'axios';
import * as cheerio from 'cheerio';

// Search endpoint prefix; the search helper appends the (encoded) query text after `q=`.
const SEARXNG_BASE_QUERY = 'http://localhost:8080/search?&lang=en&safesearch=2&format=json&q=';

// Directory whose files are read as input, and root under which per-model output folders are created.
const INPUT_DIRECTORY = '../src/content/services';
const OUTPUT_DIRECTORY = '../src/content/llm';

// Prefix placed in front of every search query.
// NOTE(review): presumably SearXNG's "IT" category bang — confirm against the instance's configuration.
const searchCategoryShebang = '!it';

// Models the main loop iterates over, in this order.
const OLLAMA_MODELS = [
    'qwen2.5:latest',
    'llama3.1:latest',
];

// Sampling parameters handed to every generation call.
// NOTE(review): Ollama documents the output-token cap as `num_predict`; confirm `max_tokens` is honoured.
const OPTIONS = {
    temperature: 0.5,
    seed: 1,
    max_tokens: 3000,
    top_p: 0.9,
    frequency_penalty: 0.2,
    presence_penalty: 0.0,
};


// One instruction per generated section. Insertion order is the order in which
// sections are generated; `[Service Name]` is a placeholder filled in at run time.
const PROMPT_SECTIONS = {
    Introduction: 'Briefly introduce **[Service Name]**, highlighting features and benefits.',
    'Uses and Benefits': 'Explain the primary use cases and benefits of **[Service Name]** in Docker.',
    'Docker Setup': 'Provide a quick setup guide for **[Service Name]** in Docker, including tips for common issues.',
    'Security Essentials': 'Outline essential security measures for **[Service Name]** in Docker.',
};

// Text placed in front of every section instruction (starts and ends with a newline).
const PRE_PROMPT = [
    '',
    'Create a short, concise, beginner-friendly article subsection for **[Service Name]** in Docker.',
    'Guidelines: Provide your response in markdown format, ensuring code snippets are formatted correctly.',
    '',
].join('\n');

// Text placed after every section instruction (starts and ends with a newline).
const POST_PROMPT = [
    '',
    'Output Requirements: Keep language clear, concise, and use Markdown format throughout.',
    'Respond solely with the information requested, do not include any urls, do not repeat the heading provided to you.',
    '',
].join('\n');


/**
 * Sends one prompt to the local Ollama `/api/generate` endpoint and returns the generated text.
 *
 * @param {string} model - Ollama model tag to run.
 * @param {string} dockerImage - Name substituted for every `[Service Name]` placeholder in `prompt`.
 * @param {string} prompt - Prompt text, optionally containing `[Service Name]` placeholders.
 * @param {string} llmContext - Extra context appended under an "Additional Context:" heading.
 * @param {object} [options] - Sampling parameters. They are sent nested under the request's
 *     `options` field, which is where Ollama reads them; a `max_tokens` entry is translated to
 *     Ollama's `num_predict` unless `num_predict` is given explicitly.
 * @returns {Promise<string|null>} The generated text, or `null` when the response is empty or the request fails.
 */
async function generateAIText(model, dockerImage, prompt, llmContext, options) {
    const ollamaUrl = 'http://localhost:11434/api/generate';

    // split/join substitutes every placeholder and inserts the image name literally;
    // String.prototype.replace with a string pattern stops after the first match and
    // interprets `$` sequences in the replacement.
    const resolvedPrompt = prompt.split('[Service Name]').join(dockerImage);
    const enhancedPrompt = `${resolvedPrompt}\n\nAdditional Context:\n${llmContext}`;

    // Model parameters placed at the top level of the request body are not applied by Ollama;
    // they belong in `options`, and the output-token cap is named `num_predict` there.
    const {max_tokens: maxTokens, ...modelOptions} = options ?? {};
    if (maxTokens != null && modelOptions.num_predict == null) {
        modelOptions.num_predict = maxTokens;
    }

    try {
        console.log(`Generating content for ${dockerImage} using model ${model}\nPrompt:\n${enhancedPrompt}`);
        const response = await axios.post(ollamaUrl, {
            model, prompt: enhancedPrompt, options: modelOptions, stream: false
        });

        const generatedContent = response.data.response;
        console.log('Generated content:', generatedContent);
        console.log('__________________________________________________________________________________________');
        return generatedContent || null;

    } catch (error) {
        // Best effort: the caller treats null as "section failed" and carries on.
        console.error('Error generating content:', error);
        return null;
    }
}

/**
 * Builds a complete guide by generating every section in PROMPT_SECTIONS, one request per section.
 *
 * Each prompt is PRE_PROMPT + " Section: <name>" + the section instruction + POST_PROMPT, with
 * every `[Service Name]` placeholder resolved before the prompt is handed to generateAIText.
 * Sections that fail to generate are logged and left out.
 *
 * NOTE(review): no section heading is written into the result, while POST_PROMPT tells the model
 * not to repeat the heading — confirm the output is meant to be heading-less.
 *
 * @param {string} model - Ollama model tag.
 * @param {string} dockerImage - Service name inserted into the prompts.
 * @param {string} context - Background text forwarded with every section prompt.
 * @param {object} options - Sampling parameters forwarded unchanged.
 * @returns {Promise<string>} A leading newline followed by the sections separated by blank lines,
 *     or an empty string when no section could be generated.
 */
async function generateGuide(model, dockerImage, context, options) {
    console.log(`Generating content for ${dockerImage} using model ${model}\ncontext:\n${context}`);
    const sections = [];

    for (const [section, prompt] of Object.entries(PROMPT_SECTIONS)) {
        const template = `${PRE_PROMPT} Section: ${section}\n${prompt}\n${POST_PROMPT}`;
        // Resolve the placeholder everywhere (PRE_PROMPT carries one too) instead of relying on
        // a later first-match-only replacement.
        const customizedPrompt = template.split('[Service Name]').join(dockerImage);
        const sectionContent = await generateAIText(model, dockerImage, customizedPrompt, context, options);
        const sectionText = typeof sectionContent === 'string' ? sectionContent.trim() : '';

        if (sectionText) {
            sections.push(sectionText);
            console.log(`Generated content for section: ${section}`);
        } else {
            console.error(`Failed to generate content for section: ${section}`);
        }
        console.log('__________________________________________________________________________________________');
    }

    // A blank line between sections keeps Markdown from merging the end of one section with
    // the start of the next; the leading newline is kept for callers that append the result
    // directly after a frontmatter block.
    return sections.length ? `\n${sections.join('\n\n')}` : '';
}


/**
 * Queries the search endpoint and keeps only the results scoring above 0.5.
 *
 * @param {string} searchShebang - Prefix placed in front of the query, separated by a space.
 * @param {string} query - Free-text search terms.
 * @returns {Promise<object[]>} Results with `score > 0.5`; an empty array when the response
 *     carries no results or the request fails.
 */
async function searchSearxNG(searchShebang, query) {
    // Escape only the value of `q`. encodeURIComponent also escapes `&`, `#`, `+` and `?`,
    // which encodeURI leaves alone and which would otherwise cut the query short or leak
    // into other parameters.
    const queryString = `${SEARXNG_BASE_QUERY}${encodeURIComponent(`${searchShebang} ${query}`)}`;

    try {
        const response = await axios.post(queryString);
        const results = response.data?.results ?? [];
        console.log('Search results:', results[0]);
        return results.filter(result => result.score > 0.5);
    } catch (error) {
        // Best effort: callers treat an empty list as "nothing found".
        console.error('Error fetching data:', error);
        return [];
    }
}

/**
 * Reads a file synchronously, parses its frontmatter and returns the value stored under one key.
 *
 * @param {string} inputFilePath - Path of the file to read (decoded as UTF-8).
 * @param {string} frontMatterKey - Frontmatter key to look up.
 * @returns {*} The stored value, or `null` when the key is absent or its value is falsy.
 */
function getFrontMatterValue(inputFilePath, frontMatterKey) {
    const {data} = matter(fs.readFileSync(inputFilePath, 'utf-8'));
    const value = data[`${frontMatterKey}`];
    if (value) {
        return value;
    }
    return null;
}

/**
 * Downloads a page and returns the text of its body collapsed onto a single line.
 *
 * `script` and `style` elements are removed before the text is extracted, and every
 * run of whitespace is replaced by one space.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} The cleaned text, or an empty string when fetching or parsing fails.
 */
async function scrapeMainWebsiteHomepage(url) {
    let cleanedPageContent;

    try {
        const {data: html} = await axios.get(url);
        const $ = cheerio.load(html);

        // Drop elements whose text is code or styling rather than page content.
        for (const tag of ['script', 'style']) {
            $(tag).remove();
        }

        const bodyText = $('body').text();
        cleanedPageContent = bodyText.trim().replace(/\s+|\n/g, ' ');
        console.log('cleanedPageContent:', cleanedPageContent);
    } catch (error) {
        console.error('Error scraping content:', error);
        return '';
    }

    return cleanedPageContent;
}

/**
 * Downloads a GitHub page and returns the text of its rendered README.
 *
 * The direct children of every `.markdown-body` element are visited in order. A child that
 * contains any image served from avatars.githubusercontent.com is skipped; the text of the
 * remaining children is joined and whitespace-collapsed.
 *
 * @param {string} githubUrl - Address of the repository page.
 * @returns {Promise<string|null>} The README text (possibly empty), or `null` when fetching or
 *     parsing fails.
 */
async function scrapeGitHubReadme(githubUrl) {
    const avatarHost = 'avatars.githubusercontent.com';

    try {
        const response = await axios.get(githubUrl);
        const $ = cheerio.load(response.data);
        const readmeContent = [];

        $('.markdown-body').contents().each((index, element) => {
            const $element = $(element);
            // Inspect every image in the element: `.attr('src')` on a multi-element selection
            // only reports the first image, which misses an avatar that follows a badge or logo.
            const hasAvatarImage = $element.find('img').toArray()
                .some(img => $(img).attr('src')?.includes(avatarHost));
            if (hasAvatarImage) {
                return; // Skip to the next element
            }
            readmeContent.push($element.text().trim());
        });

        // Join accumulated content into a single string, removing excess whitespace
        return readmeContent.join(' ').replace(/\s+/g, ' ').trim();
    } catch (error) {
        console.error('Error scraping GitHub README:', error);
        return null;
    }
}

/**
 * Collects the tags of search results whose URL contains `hub.docker` or `github.com`.
 *
 * Results without a `tags` array contribute nothing instead of raising, and each tag is
 * returned once, in order of first appearance.
 *
 * @param {object[]} searchResults - Search results; `url` and `tags` are read from each.
 * @returns {string[]} De-duplicated tags from the matching results.
 */
function generateTagsFromSearXNGResults(searchResults) {
    console.log('generateTagsFromSearXNGResults.searchResults:', searchResults);
    const uniqueTags = new Set();

    for (const result of searchResults) {
        const url = result.url ?? '';
        if (!url.includes('hub.docker') && !url.includes('github.com')) {
            continue;
        }
        // Not every result carries tags; spreading a missing value would throw.
        const tags = Array.isArray(result.tags) ? result.tags : [];
        for (const tag of tags) {
            uniqueTags.add(tag);
        }
    }

    return [...uniqueTags];
}

/**
 * Gathers the `content` snippets of search results whose URL contains `hub.docker.com`
 * or `github.com`, dropping empty ones.
 *
 * @param {object[]} searchResults - Search results; `url` and `content` are read from each.
 * @returns {string[]} One entry per matching result with non-empty content, in input order.
 */
function generateFrontMatterDescription(searchResults) {
    console.log('generateFrontMatterDescription.searchResults:', searchResults);
    const descriptions = [];

    for (const {url, content} of searchResults) {
        const isKnownSource = url.includes('hub.docker.com') || url.includes('github.com');
        if (isKnownSource && content) {
            descriptions.push(content);
        }
    }

    return descriptions;
}


/**
 * Turns a model tag into a lower-case name usable as a directory or file name.
 *
 * Every `:` and space becomes `_` and every `.` becomes `-`. Global patterns are used
 * because a string pattern passed to `replace` only changes the first occurrence.
 *
 * @param {string} modelName - Model tag, e.g. `qwen2.5:latest`.
 * @returns {string} The sanitised, lower-cased name, e.g. `qwen2-5_latest`.
 */
const sanitiseModelNameForUseInFileName = (modelName) => modelName
    .replace(/[: ]/g, '_')
    .replace(/\./g, '-')
    .toLowerCase();


/**
 * Gathers tags, description snippets and background text for one service.
 *
 * The project homepage is scraped and the title is searched concurrently. When the search
 * yields results with tags, the first result pointing at github.com is scraped as well.
 * Homepage text that was successfully scraped is always used as context, even when the
 * search returns nothing usable.
 *
 * @param {string} projectUrl - Homepage of the service.
 * @param {string} title - Service title used as the search query.
 * @returns {Promise<{tags: string[], description: (string[]|string), context: string}>}
 *     `description` is the empty string on the fallback paths and the value produced by
 *     generateFrontMatterDescription otherwise; `context` is never empty.
 */
async function fetchServiceContext(projectUrl, title) {
    try {
        // The two lookups are independent, so run them side by side.
        const [homepageContent, searchResults] = await Promise.all([
            scrapeMainWebsiteHomepage(projectUrl),
            searchSearxNG(searchCategoryShebang, title),
        ]);
        const homepageContext = homepageContent ? `App website:\n${homepageContent}` : '';

        if (!searchResults.length) {
            console.error('No search results found for:', title);
            // Keep the homepage text rather than replacing it with a placeholder.
            return {tags: [], description: '', context: homepageContext || 'No searchResults available.'};
        }

        const tags = generateTagsFromSearXNGResults(searchResults);
        if (!tags.length) {
            console.error('No tags found for:', title);
            return {tags: [], description: '', context: homepageContext || 'No Tags available.'};
        }

        const description = generateFrontMatterDescription(searchResults);
        const githubUrl = searchResults.find(result => result.url?.includes('github.com'))?.url;
        const githubReadme = githubUrl ? await scrapeGitHubReadme(githubUrl) : null;
        const readmeContext = githubReadme ? `App GitHub README: ${githubReadme}` : '';

        const context = [homepageContext, readmeContext].filter(Boolean).join('\n')
            || 'No additional context available.';

        return {tags, description, context};
    } catch (error) {
        console.error('Error fetching service context:', error);
        return {tags: [], description: '', context: 'No context available.'};
    }
}

/**
 * Generates one output file per (model, input file) pair.
 *
 * For every model in OLLAMA_MODELS a sub-directory of OUTPUT_DIRECTORY is created, then each
 * file in INPUT_DIRECTORY is parsed, skipped when its `iconName` is `default` or it has no
 * `dockerImage`, and otherwise turned into a guide that is written together with the original
 * frontmatter extended by `description`, `tags`, `model`, `isAI` and `generationTime` (seconds).
 *
 * @returns {Promise<void>}
 */
const main = async () => {
    const files = fs.readdirSync(INPUT_DIRECTORY);

    // Service context depends on the input file only, not on the model, so it is fetched
    // once per file and reused for every model instead of re-scraping the same pages.
    const contextByFile = new Map();

    for (const model of OLLAMA_MODELS) {
        const sanitizedModelName = sanitiseModelNameForUseInFileName(model);
        const modelOutputDir = `${OUTPUT_DIRECTORY}/${sanitizedModelName}`;
        if (!fs.existsSync(modelOutputDir)) fs.mkdirSync(modelOutputDir, {recursive: true});

        for (const file of files) {
            const inputFilePath = `${INPUT_DIRECTORY}/${file}`;
            const outputFilePath = `${modelOutputDir}/${file}`;

            // Parse the file once and read every field from that single result.
            const frontmatter = matter(fs.readFileSync(inputFilePath, 'utf-8'));
            const dockerImage = frontmatter.data.dockerImage || null;
            const iconName = frontmatter.data.iconName || null;

            if (iconName === 'default') {
                console.error(`Skipping ${dockerImage} as it has default icon.`);
                continue;
            }
            if (!dockerImage) {
                console.error(`No dockerImage found for ${file}`);
                continue;
            }

            if (!contextByFile.has(file)) {
                contextByFile.set(
                    file,
                    await fetchServiceContext(frontmatter.data.projectUrl, frontmatter.data.title)
                );
            }
            const {tags, description, context} = contextByFile.get(file);

            // Only the generation itself is timed, not the context lookup.
            const startTime = Date.now();
            const articleContent = await generateGuide(model, dockerImage, context, OPTIONS);
            const generationTime = (Date.now() - startTime) / 1000;

            if (!articleContent) {
                console.error(`Failed to generate content for ${dockerImage} with ${model}`);
                continue;
            }

            const newFrontmatter = {
                ...frontmatter.data,
                description,
                tags,
                model,
                isAI: true,
                generationTime,
            };

            const finalContent = matter.stringify('', newFrontmatter) + articleContent;
            fs.writeFileSync(outputFilePath, finalContent, 'utf-8');

            console.log(`Generated and saved content for ${dockerImage} with ${model} in ${generationTime}s.`);
        }
    }
};

// Start the run; print a completion notice, or the error if the run rejects.
main()
    .then(() => console.log('Done!'))
    .catch(console.error);