phpBB to LLM

phpBB to LLM

Postby Antonio Linares » Thu Dec 21, 2023 9:57 am

1. Create a SQL dump file from your phpBB forums:

mysqldump --user=username --password=YourPassword database_phpbb > dumpfile.sql

2. We "clean" the dump file using this code:
Code: Select all  Expand view  RUN
#include "FiveWin.ch"

// Cleans a phpBB mysqldump: reads "dumpfile.sql", replaces the HTML character
// references listed below with the literal characters they stand for, and
// writes the result to "forums.sql".
function Main()

    local cSQL := hb_memoRead( "dumpfile.sql" )

    // Keys are HTML character references, values are the replacement text.
    // The numeric references (&#NN;) had been rendered as plain characters in
    // the published listing, which turned those entries into no-ops and made
    // the first key an invalid string literal; they are written out
    // explicitly here.
    // NOTE(review): a bare ' written into a single-quoted SQL string may need
    // escaping as \' - confirm against your dump before restoring it.
    // The backslash entry intentionally yields two backslashes: Harbour string
    // literals have no escape sequences, and SQL needs the backslash doubled.
    local hHTMLCodes := { ;
        "&#34;" => '"',;
        "&#35;" => '#',;
        "&#36;" => '$',;
        "&#37;" => '%',;
        "&#38;" => '&',;
        "&#39;" => "'",;
        "&#40;" => '(',;
        "&#41;" => ')',;
        "&#42;" => '*',;
        "&#43;" => '+',;
        "&#44;" => ',',;
        "&hyphen-minus;" => '-',;
        "&#46;" => '.',;
        "&#47;" => '/',;
        "&#58;" => ':',;
        "&#59;" => ';',;
        "&lt;" => '<',;
        "&equals;" => '=',;
        "&gt;" => '>',;
        "&quest;" => '?',;
        "&commat;" => '@',;
        "&lsqb;" => '[',;
        "&bsol;" => '\\',;
        "&rsqb;" => ']',;
        "&Hat;" => '^',;
        "&lowbar;" => '_',;
        "&grave;" => '`',;
        "&lcub;" => '{',;
        "&verbar;" => '|',;
        "&rcub;" => '}',;
        "&#126;" => '~' }

    // hb_strReplace() makes a single pass over the text, so a character
    // produced by one replacement is never re-scanned as part of another key.
    hb_memoWrit( "forums.sql", hb_strReplace( cSQL, hHTMLCodes ) )

return nil

3. Create a local MySQL "forums" database and, using HeidiSQL, restore the cleaned dump (the forums.sql file written in step 2) into it

4. Create a DBF from it using this code:
Code: Select all  Expand view  RUN
#include "FiveWin.ch"

request dbfcdx

// Exports every forum post (date, time, forum, topic, author, text) from the
// local "forums" MySQL database into posts.dbf.
function Main()

    local oCn := Maria_Connect( { "localhost", "forums", "username", "YourPassword" } )
    local cSQL

    // Without a connection there is nothing to export; bail out instead of
    // sending a message to a non-object.
    // NOTE(review): assumes Maria_Connect() returns nil on failure - confirm.
    if oCn == nil
       MsgInfo( "Could not connect to the forums database" )
       return nil
    endif

    // post_time is a Unix timestamp; it is split into a date and a time column.
    TEXT INTO cSQL
    SELECT
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%Y-%m-%d') AS date,
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%H:%i') AS time,
        phpbb_forums.forum_name AS forum,
        phpbb_topics.topic_title AS topic,
        phpbb_users.username AS username,
        phpbb_posts.post_text AS text
    FROM
        phpbb_posts
    JOIN
        phpbb_users ON phpbb_posts.poster_id = phpbb_users.user_id
    JOIN
        phpbb_topics ON phpbb_posts.topic_id = phpbb_topics.topic_id
    JOIN
        phpbb_forums ON phpbb_posts.forum_id = phpbb_forums.forum_id;
    ENDTEXT

    oCn:SaveToDbf( cSQL, "posts.dbf" )

    // Release the server connection once the export is done.
    oCn:Close()

return nil

5. Now we create a small dataset in json to make tests:
Code: Select all  Expand view  RUN
#include "FiveWin.ch"

request dbfcdx

// Builds a small test dataset: the first 20 topics (or fewer, if the table
// holds fewer) of posts.dbf, written to forums.json and shown in a browser.
function Main()

    local aPosts := {}

    USE posts VIA "dbfcdx"

    // Order by topic so that all posts of one topic are contiguous.
    INDEX ON posts->topic + posts->date + posts->time + posts->forum TO subject
    GO TOP

    // Stop at end of file as well as at 20 topics; otherwise a short table
    // would be padded with empty topic entries read from the EOF record.
    while Len( aPosts ) < 20 .and. ! Eof()
       AAdd( aPosts, GetTopic() )
    end

    hb_memoWrit( "forums.json", hb_jsonEncode( aPosts ) )
    XBrowser( aPosts )

return nil

// Collects the current record and all following records that share its topic
// into one hash: { "topic" => title, "messages" => { post, post, ... } }.
// Leaves the record pointer on the first record of the next topic (or at EOF).
function GetTopic()

    local hTopic := {=>}
    // Keep the raw field value for comparisons. The trimmed, UTF-8 converted
    // title is only for output: comparing the field against it with == fails
    // whenever the field is padded or contains non-ASCII characters, which
    // would split one topic into many single-message topics.
    local cKey := posts->topic

    hTopic[ "topic" ]    = StrToUtf8( RTrim( cKey ) )
    hTopic[ "messages" ] = {}

    AAdd( hTopic[ "messages" ], GetPost() )
    SKIP
    // The EOF test prevents looping forever when the last topic has a blank
    // title (the EOF phantom record also reads as blank).
    while ! Eof() .and. posts->topic == cKey
       AAdd( hTopic[ "messages" ], GetPost() )
       SKIP
    end

return hTopic

// Returns the current posts record as a hash ready for JSON encoding.
// Text fields are converted to UTF-8; date is passed through as stored.
function GetPost()

return { ;
    "topic"    => StrToUtf8( RTrim( posts->topic ) ),;
    "forum"    => StrToUtf8( RTrim( posts->forum ) ),;
    "username" => StrToUtf8( RTrim( posts->username ) ),;
    "date"     => posts->date,;
    "time"     => RTrim( posts->time ),;
    "text"     => StrToUtf8( posts->text ) }

#pragma BEGINDUMP

#include <windows.h>
#include <hbapi.h>

/*
 * STRTOUTF8( cText ) -> cUtf8
 *
 * Converts a string from the system ANSI code page (CP_ACP) to UTF-8 by
 * going through UTF-16. Returns an empty string when the argument is
 * missing, not a string, or empty.
 */
HB_FUNC( STRTOUTF8 )
{
   const char * szSource = hb_parc( 1 );
   int iSourceLen = ( int ) hb_parclen( 1 );
   int iWideLen;
   int iUtf8Len;
   LPWSTR szWideText;
   char * szDest;

   /* The Win32 conversion calls reject a zero length, so handle it here. */
   if( szSource == NULL || iSourceLen == 0 )
   {
      hb_retc_null();
      return;
   }

   /* First call sizes the UTF-16 buffer, second call fills it. */
   iWideLen = MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, NULL, 0 );
   szWideText = ( LPWSTR ) hb_xgrab( ( iWideLen + 1 ) * sizeof( WCHAR ) );
   MultiByteToWideChar( CP_ACP, 0, szSource, iSourceLen, szWideText, iWideLen );
   szWideText[ iWideLen ] = 0;   /* character terminator, not the NULL pointer */

   /* Same two-step pattern for UTF-16 -> UTF-8. */
   iUtf8Len = WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, NULL, 0, NULL, NULL );
   szDest = ( char * ) hb_xgrab( iUtf8Len + 1 );
   WideCharToMultiByte( CP_UTF8, 0, szWideText, iWideLen, szDest, iUtf8Len, NULL, NULL );
   szDest[ iUtf8Len ] = 0;
   hb_xfree( ( void * ) szWideText );

   /* Hand the buffer over to the VM with its explicit length: no extra copy,
      and text after an embedded NUL byte is not dropped. The VM now owns
      szDest, so it must not be freed here. */
   hb_retclen_buffer( szDest, iUtf8Len );
}

#pragma ENDDUMP
 

6. Next, we upload this forums.json as a dataset to HuggingFace to verify that it is correct. Open a free account at HuggingFace, create a dataset and upload forums.json. If you can properly inspect forums.json from HuggingFace, then forums.json is OK.

The structure of the generated json file is as follows:
Code: Select all  Expand view  RUN
[
   {  "topic": the title of the topic,
      "messages":
      [
         {
            "topic": the title of the topic,
            "forum": the forum name,
            "username": name of the author,
            "date": date of the post,
            "time": time of the post,
            "text": text of the post
         },
        next posts for the same topic
      ]
   },
   next topic,
   ...
]

so basically it is a list of the topics, with the name of the topic and the list of messages for such topic.

Here you have the Google Colab notebook to run the fine-tuning training of Microsoft Phi-2:
https://github.com/FiveTechSoft/FWH_tools/blob/master/fivetech_forums.ipynb

======== original post

Some days ago our friend Bruno Cantero suggested a great idea to me:

to generate an AI LLM from these forums, as these forums have been running for 18 years, so it may be a great dataset to train an AI LLM :-)

So first thing we need is to build a dataset from it. Here I am posting some initial tests that I expect that we may be able to complete with the help of Uwe and Rao:

topics.prg
Code: Select all  Expand view  RUN
#include "FiveWin.ch"

// Downloads one forum topic page and shows how many posts were parsed from it.
function Main()

    local cPage := WebPageContents( "http://forums.fivetechsupport.com/viewtopic.php?f3&t=8" )
    local aFound := GetTopics( cPage )

    MsgInfo( Len( aFound ) )

return nil

// Splits a phpBB topic page into its posts and returns an array with one
// hash per post, as built by GetTopic().
//   cHtml - HTML source of the page
// A post is taken to start after the text class="post bg and to run through
// the next <hr class="divider" that follows it.
function GetTopics( cHtml )

   local cPostMark := 'class="post bg'
   local cDivider  := '<hr class="divider"'
   local aTopics   := {}
   local nFrom     := 1
   local nPost, nBody, nDiv

   // Nothing to parse if the download failed and did not return a string.
   if ValType( cHtml ) != "C"
      return aTopics
   endif

   while ( nPost := hb_At( cPostMark, cHtml, nFrom ) ) != 0
      nBody := nPost + Len( cPostMark )

      // Look for the divider only AFTER this post's marker. Searching from
      // the start of the remaining text picks up a divider that precedes the
      // post, which makes the same post be found and added again.
      nDiv := hb_At( cDivider, cHtml, nBody )

      if nDiv == 0
         // Last post without a closing divider: take the rest and stop,
         // instead of creeping forward a few characters per iteration.
         AAdd( aTopics, GetTopic( SubStr( cHtml, nBody ) ) )
         exit
      endif

      // Same slice as before: from just after the marker through the divider
      // text plus one character.
      AAdd( aTopics, GetTopic( SubStr( cHtml, nBody, nDiv - nBody + 1 + Len( cDivider ) ) ) )
      nFrom := nDiv + Len( cDivider )
   end

return aTopics

// Extracts the body and the author name from the HTML of a single post and
// returns them as { "contents" => ..., "author" => ... }.
//   cTopic - HTML fragment of one post
// Also pops up the extracted values in a message box.
function GetTopic( cTopic )

   local cContentTag := '<div class="content">'
   local cAuthorTag  := '<p class="author">'
   local cBody, cWho

   // Body: everything after the content tag, up to the first closing </div>.
   cBody := SubStr( cTopic, At( cContentTag, cTopic ) + Len( cContentTag ) )
   cBody := Left( cBody, At( "</div>", cBody ) - 1 )

   // Author: cut the author paragraph 5 characters before </strong>, then
   // keep what follows the last ">".
   // NOTE(review): the 5 presumably skips a closing </a> tag - confirm
   // against the page markup.
   cWho := SubStr( cTopic, At( cAuthorTag, cTopic ) + Len( cAuthorTag ) )
   cWho := Left( cWho, At( "</strong>", cWho ) - 5 )
   cWho := SubStr( cWho, RAt( ">", cWho ) + 1 )

   MsgInfo( cBody, cWho )

return { "contents" => cBody, "author" => cWho }
 
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Antonio Linares » Thu Dec 21, 2023 11:49 am

Another ideal use would be a DBF to LLM :-)

We already have llama64.dll to run it !
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby paquitohm » Thu Dec 21, 2023 12:22 pm

Antonio,

Pues me parece una buenisima idea.

Se podria interrogar a la IA sobre cual es el mejor vendedor, cuando vendio mas, que vendedor ha decrecido, etc

slds
paquitohm
 
Posts: 266
Joined: Fri Jan 14, 2022 8:37 am

Re: phpBB to LLM

Postby Jimmy » Thu Dec 21, 2023 12:27 pm

hi Antonio,
Antonio Linares wrote:Another ideal use would be a DBF to LLM :-)

We already have llama64.dll to run it !

i do have a DBF of Fivewin Forum

i use my phpBB "Codebox" Reader based on Idea of Uwe
https://www.hmgforum.com/viewtopic.php?t=7281

extract "Codebox" Tag is not my Problem, it are HTML "Sign" which i try to STRTRAN()
it work so far with CODE but in "Body" i still have a lot HTML "Sign"

Question : is there a Function HTML2TEXT() to get plain TEXT from HTML Message :?:
greeting,
Jimmy
User avatar
Jimmy
 
Posts: 1732
Joined: Thu Sep 05, 2019 5:32 am
Location: Hamburg, Germany

Re: phpBB to LLM

Postby Antonio Linares » Thu Dec 21, 2023 4:31 pm

paquitohm wrote:Antonio,

Pues me parece una buenisima idea.

Se podria interrogar a la IA sobre cual es el mejor vendedor, cuando vendio mas, que vendedor ha decrecido, etc

slds


totalmente de acuerdo. Tenemos que conseguirlo :-)
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Marc Venken » Thu Dec 21, 2023 8:47 pm

I also generate a DBF based on Rao and Uwe code.

I only extract the source samples from all the posts. Offline I then search the dbf for keywords of any kind. It helped me many many times in finding solutions because It is showing sample code.
Marc Venken
Using: FWH 23.04 with Harbour
User avatar
Marc Venken
 
Posts: 1437
Joined: Tue Jun 14, 2016 7:51 am
Location: Belgium

Re: phpBB to LLM

Postby Antonio Linares » Fri Dec 22, 2023 6:43 am

This is the type of csv file that we have to generate from a DBF:

customer.csv
ID,FIRST,LAST,STREET,CITY,STATE,ZIP,HIREDATE,MARRIED,AGE,SALARY,NOTES
1,Homer,Simpson,32179 Maiden Lane,Springfield,IL,20503-8202,1992-09-18,True,50,5900.0,This is a test for record 1
2,Ceci,Gibbard,9540 Raynes Park Road,Miami,MA,55774-2304,1984-10-17,False,28,123700.0,This is a test for record 2
3,Reg,Kaczocha,30522 Park Ten Place,Scottsdale,WY,09226-1483,1989-05-23,True,43,82900.0,This is a test for record 3
4,David,Jochum,8211 Carnegie Center,Hingham,IL,71947-5114,1900-10-10,True,34,120000.0,This is a test for record 4
5,Simpson,Cafee,32736 Meadowbrook Drive,Nedlands,ID,38179-3789,1990-12-11,True,88,51800.0,This is a test for record 5
6,Tom,Logan,6180 Roselle Street,West Covina,CT,82378-0904,1992-02-24,True,90,20400.0,This is a test for record 6
7,Gary,Brock,3893 Canandaigua Road,Senford,WV,94177-5329,1987-09-12,True,58,145300.0,This is a test for record 7
8,Frank,Fonseca,18712 Sherman Way,Ashby,RI,08218-8409,1988-02-16,False,46,118900.0,This is a test for record 8
9,Rick,Sencovici,13802 South University,Arcadia,HI,82063-8091,1987-01-17,True,55,23700.0,This is a test for record 9
10,Hugh,Lupton,16472 S. LaSalle Street,Tarzana,AK,79021-0643,1989-08-28,False,89,96700.0,This is a test for record 10
11,_,Farley,19123 Washington Street,Boston,IN,25885-0851,1985-08-31,True,46,77300.0,This is a test for record 11
12,Johnny,Fischer,30621 Inridge Drive,McLean,WA,86275-8035,1988-11-12,False,37,2300.0,This is a test for record 12
13,Corkey,Young,9069 Avon Place,Lund,NC,36199-1793,1988-12-24,True,54,30000.0,This is a test for record 13
14,Phyllis,Lechuga,1457 Indianapolis Ave,Council Bluffs,AR,73036-5749,1987-01-29,False,94,84600.0,This is a test for record 14
15,Chester,Padilla,32385 Federal Street,Ashby,MS,82882-2447,1985-12-22,True,90,144000.0,This is a test for record 15
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Otto » Fri Dec 22, 2023 9:31 am

Dear Antonio,

Can we download the DBF (phpBB) file from somewhere?

Best regards,

Otto
********************************************************************
mod harbour - Vamos a la conquista de la Web
modharbour.org
https://www.facebook.com/groups/modharbour.club
********************************************************************
User avatar
Otto
 
Posts: 6334
Joined: Fri Oct 07, 2005 7:07 pm

Re: phpBB to LLM

Postby VictorCasajuana » Fri Dec 22, 2023 9:47 am

Hola Antonio, tienes acceso a la base de datos del foro? en este caso sería más sencillo extraer la información.
--------
¿ Y porque no ?
¿ And why not ?
User avatar
VictorCasajuana
 
Posts: 265
Joined: Wed Mar 28, 2018 4:38 pm
Location: Vinaròs

Re: phpBB to LLM

Postby Antonio Linares » Fri Dec 22, 2023 10:22 am

Estimado Victor,

Si, claro, tenemos acceso a la base de datos.

De todas formas la cuestión ahora es como organizar esos datos para que puedan ser entrenados en un modelo de IA pre entrenado.

Por el momento hemos encontrado este modelo que nos da una primera idea:
data = {
'text': ['Initial post for programming question.', 'Reply 1: I think the issue is with your code.',
'Reply 2: Can you provide more details?', 'Initial post for hardware issue.',
'Reply 1: Have you checked the connections?', 'Reply 2: Try updating your drivers.'],
'label': ['programming', 'programming', 'programming', 'hardware', 'hardware', 'hardware']
}


poco a poco :-)
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Antonio Linares » Fri Dec 22, 2023 10:24 am

Otto wrote:Dear Antonio,

Can we download the DBF (phpBB) file from somewhere?

Best regards,

Otto


Dear Otto,

We have not generated a DBF from these forums yet, but surely we will do it :-)
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Antonio Linares » Fri Dec 22, 2023 1:32 pm

Dear Otto, Victor,

We already have a posts.dbf with all the forums posts :-)

We are reviewing if it has some private info that must not be shared...
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Antonio Linares » Sat Dec 23, 2023 4:51 pm

This seems to be fine:

Code: Select all  Expand view  RUN
#include "FiveWin.ch"

request dbfcdx

// Exports every forum post (date, time, forum, topic, author, text) from the
// local "forums" MySQL database into c:\temp\posts.dbf.
function Main()

    local oCn := Maria_Connect( { "localhost", "forums", "root", "password" } )
    local cSQL

    // Without a connection there is nothing to export; bail out instead of
    // sending a message to a non-object.
    // NOTE(review): assumes Maria_Connect() returns nil on failure - confirm.
    if oCn == nil
       MsgInfo( "Could not connect to the forums database" )
       return nil
    endif

    // post_time is a Unix timestamp; it is split into a date and a time column.
    TEXT INTO cSQL
    SELECT
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%Y-%m-%d') AS date,
        DATE_FORMAT(FROM_UNIXTIME(phpbb_posts.post_time), '%H:%i') AS time,
        phpbb_forums.forum_name AS forum,
        phpbb_topics.topic_title AS topic,
        phpbb_users.username AS username,
        phpbb_posts.post_text AS text
    FROM
        phpbb_posts
    JOIN
        phpbb_users ON phpbb_posts.poster_id = phpbb_users.user_id
    JOIN
        phpbb_topics ON phpbb_posts.topic_id = phpbb_topics.topic_id
    JOIN
        phpbb_forums ON phpbb_posts.forum_id = phpbb_forums.forum_id;
    ENDTEXT

    oCn:SaveToDbf( cSQL, "c:\temp\posts.dbf" )

    // Release the server connection once the export is done.
    oCn:Close()

return nil
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Antonio Linares » Sat Dec 23, 2023 5:22 pm

Here you have a posts.dbf and posts.fpt with all the contents of these forums :-)

We appreciate if you can review it and check if they are fine for you:
https://github.com/FiveTechSoft/forums/blob/master/posts_dbf.zip

join these parts using Total Commander so you get a posts_fpt.zip
https://github.com/FiveTechSoft/forums/blob/master/posts_fpt.001
https://github.com/FiveTechSoft/forums/blob/master/posts_fpt.002
https://github.com/FiveTechSoft/forums/blob/master/posts_fpt.003
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Re: phpBB to LLM

Postby Antonio Linares » Sat Dec 23, 2023 9:19 pm

First attempt at building the dataset. hb_jsonEncode() GPFs (crashes with a general protection fault)...

Code: Select all  Expand view  RUN
#include "FiveWin.ch"

request dbfcdx

// Walks the whole posts table grouped by topic and reports how many topics
// were collected. JSON encoding is left disabled.
function Main()

    local aTopics := {}

    USE posts VIA "dbfcdx"

    // Order by topic so that all posts of one topic are contiguous.
    INDEX ON posts->topic + posts->date + posts->time + posts->forum TO subject
    GO TOP

    // GetTopic() advances the record pointer past the topic it returns.
    do while ! Eof()
       AAdd( aTopics, GetTopic() )
    enddo

    // hb_jsonEncode( aTopics )
    MsgInfo( Len( aTopics ) )

return nil

// Collects the current record and all following records that share its topic
// into one hash: { "topic" => title, "messages" => { post, post, ... } }.
// Leaves the record pointer on the first record of the next topic (or at EOF).
function GetTopic()

    local hTopic := {=>}, cTopic := posts->topic

    hTopic[ "topic" ]    = cTopic
    hTopic[ "messages" ] = {}

    AAdd( hTopic[ "messages" ], GetPost() )
    SKIP
    // The EOF test prevents looping forever when the last topic has a blank
    // title: the EOF phantom record also reads as blank, and SKIP no longer
    // moves once EOF is reached.
    while ! Eof() .and. posts->topic == cTopic
       AAdd( hTopic[ "messages" ], GetPost() )
       SKIP
    end

return hTopic

// Returns topic, username and text of the current posts record as a hash.
function GetPost()

return { ;
    "topic"    => posts->topic,;
    "username" => posts->username,;
    "text"     => posts->text }
regards, saludos

Antonio Linares
www.fivetechsoft.com
User avatar
Antonio Linares
Site Admin
 
Posts: 42117
Joined: Thu Oct 06, 2005 5:47 pm
Location: Spain

Next

Return to FiveWin for Harbour/xHarbour

Who is online

Users browsing this forum: No registered users and 101 guests